From 879c98913c360b01f63588685c01ac06e83be54d Mon Sep 17 00:00:00 2001 From: Allan MacKinnon Date: Wed, 20 Jun 2018 08:29:07 -0700 Subject: Overdue reorg of source tree to support multiple platforms & devices. Bug: skia: Change-Id: I1248a529a932ed5ef32952a1bb7eca56ee1c5f25 Reviewed-on: https://skia-review.googlesource.com/136170 Reviewed-by: Mike Klein Commit-Queue: Mike Klein --- src/compute/skc/Makefile | 79 - src/compute/skc/allocator_device_cl.c | 136 - src/compute/skc/allocator_device_cl.h | 54 - src/compute/skc/atomic_cl.h | 72 - src/compute/skc/block_pool_cl.h | 60 - src/compute/skc/block_pool_cl_12.h | 33 - src/compute/skc/block_pool_init.cl | 64 - src/compute/skc/cl_20/extent.c | 787 ----- src/compute/skc/cl_20/extent.h | 390 --- src/compute/skc/cl_20/ring_cl_svm_fine.cpp | 89 - src/compute/skc/cl_20/ring_cl_svm_fine.h | 46 - src/compute/skc/common.h | 2 + src/compute/skc/composition_cl_12.c | 823 ----- src/compute/skc/composition_cl_12.h | 105 - src/compute/skc/config_cl.h | 147 - src/compute/skc/cq_pool_cl.c | 152 - src/compute/skc/cq_pool_cl.h | 46 - src/compute/skc/device_cl_12.h | 95 - src/compute/skc/device_cl_12_avx2.h | 60 - src/compute/skc/device_cl_12_gen9.c | 942 ------ src/compute/skc/device_cl_12_gen9.h | 335 -- src/compute/skc/export_cl_12.h | 63 - src/compute/skc/extent_cl_12.c | 459 --- src/compute/skc/extent_cl_12.h | 476 --- src/compute/skc/extent_cl_12_unified.c | 281 -- src/compute/skc/fills_expand.cl | 309 -- src/compute/skc/handle_pool_cl_12.c | 752 ----- src/compute/skc/handle_pool_cl_12.h | 177 - src/compute/skc/interop.c | 629 ---- src/compute/skc/interop.h | 42 - src/compute/skc/main.c | 20 +- src/compute/skc/make_all.bat | 15 - src/compute/skc/make_inl_cl.bat | 72 - src/compute/skc/path_builder_cl_12.c | 1443 --------- src/compute/skc/path_builder_cl_12.h | 44 - src/compute/skc/paths_copy.cl | 543 ---- src/compute/skc/paths_reclaim.cl | 390 --- src/compute/skc/place.cl | 871 ----- .../skc/platforms/cl_12/allocator_device_cl.c | 136 + .../skc/platforms/cl_12/allocator_device_cl.h | 54 + src/compute/skc/platforms/cl_12/atomic_cl.h | 72 + src/compute/skc/platforms/cl_12/block_pool_cl.h | 60 + src/compute/skc/platforms/cl_12/block_pool_cl_12.h | 33 + .../skc/platforms/cl_12/composition_cl_12.c | 823 +++++ .../skc/platforms/cl_12/composition_cl_12.h | 105 + src/compute/skc/platforms/cl_12/config_cl.h | 147 + src/compute/skc/platforms/cl_12/cq_pool_cl.c | 152 + src/compute/skc/platforms/cl_12/cq_pool_cl.h | 46 + src/compute/skc/platforms/cl_12/device_cl_12.h | 95 + src/compute/skc/platforms/cl_12/export_cl_12.h | 63 + src/compute/skc/platforms/cl_12/extent_cl_12.c | 459 +++ src/compute/skc/platforms/cl_12/extent_cl_12.h | 476 +++ .../skc/platforms/cl_12/extent_cl_12_unified.c | 281 ++ src/compute/skc/platforms/cl_12/gl/interop.c | 629 ++++ src/compute/skc/platforms/cl_12/gl/interop.h | 42 + .../skc/platforms/cl_12/handle_pool_cl_12.c | 752 +++++ .../skc/platforms/cl_12/handle_pool_cl_12.h | 177 + .../skc/platforms/cl_12/kernels/block_pool_init.cl | 64 + .../cl_12/kernels/devices/avx2/device_cl_12_avx2.h | 60 + .../cl_12/kernels/devices/gen9/device_cl_12.c | 938 ++++++ .../cl_12/kernels/devices/gen9/device_cl_12.h | 341 ++ .../cl_12/kernels/devices/gen9/inl/make_all.bat | 15 + .../cl_12/kernels/devices/gen9/inl/make_inl_cl.bat | 85 + .../skc/platforms/cl_12/kernels/fills_expand.cl | 309 ++ .../skc/platforms/cl_12/kernels/paths_copy.cl | 543 ++++ .../skc/platforms/cl_12/kernels/paths_reclaim.cl | 390 +++ src/compute/skc/platforms/cl_12/kernels/place.cl | 871 
+++++ src/compute/skc/platforms/cl_12/kernels/prefix.cl | 1041 ++++++ .../skc/platforms/cl_12/kernels/rasterize.cl | 3366 +++++++++++++++++++ .../skc/platforms/cl_12/kernels/rasters_alloc.cl | 144 + .../skc/platforms/cl_12/kernels/rasters_reclaim.cl | 442 +++ src/compute/skc/platforms/cl_12/kernels/render.cl | 2165 +++++++++++++ .../skc/platforms/cl_12/kernels/segment_ttck.cl | 130 + .../skc/platforms/cl_12/kernels/segment_ttrk.cl | 394 +++ .../skc/platforms/cl_12/path_builder_cl_12.c | 1443 +++++++++ .../skc/platforms/cl_12/path_builder_cl_12.h | 44 + .../skc/platforms/cl_12/raster_builder_cl_12.c | 1349 ++++++++ .../skc/platforms/cl_12/raster_builder_cl_12.h | 165 + src/compute/skc/platforms/cl_12/runtime_cl.c | 362 +++ src/compute/skc/platforms/cl_12/runtime_cl.h | 79 + src/compute/skc/platforms/cl_12/runtime_cl_12.c | 314 ++ src/compute/skc/platforms/cl_12/runtime_cl_12.h | 177 + src/compute/skc/platforms/cl_12/styling_cl_12.c | 339 ++ src/compute/skc/platforms/cl_12/styling_cl_12.h | 73 + src/compute/skc/platforms/cl_12/surface_cl_12.h | 32 + .../skc/platforms/cl_12/surface_cl_12_buffer.c | 453 +++ src/compute/skc/prefix.cl | 1042 ------ src/compute/skc/raster_builder_cl_12.c | 1349 -------- src/compute/skc/raster_builder_cl_12.h | 165 - src/compute/skc/rasterize.cl | 3367 -------------------- src/compute/skc/rasters_alloc.cl | 144 - src/compute/skc/rasters_reclaim.cl | 442 --- src/compute/skc/render.cl | 2165 ------------- src/compute/skc/runtime_cl.c | 362 --- src/compute/skc/runtime_cl.h | 79 - src/compute/skc/runtime_cl_12.c | 314 -- src/compute/skc/runtime_cl_12.h | 177 - src/compute/skc/segment_ttck.cl | 131 - src/compute/skc/segment_ttrk.cl | 396 --- src/compute/skc/styling_cl_12.c | 339 -- src/compute/skc/styling_cl_12.h | 73 - src/compute/skc/surface_cl_12.h | 32 - src/compute/skc/surface_cl_12_buffer.c | 453 --- src/compute/skc/types.h | 34 +- 104 files changed, 20756 insertions(+), 22141 deletions(-) delete mode 100644 src/compute/skc/Makefile delete mode 100644 src/compute/skc/allocator_device_cl.c delete mode 100644 src/compute/skc/allocator_device_cl.h delete mode 100644 src/compute/skc/atomic_cl.h delete mode 100644 src/compute/skc/block_pool_cl.h delete mode 100644 src/compute/skc/block_pool_cl_12.h delete mode 100644 src/compute/skc/block_pool_init.cl delete mode 100644 src/compute/skc/cl_20/extent.c delete mode 100644 src/compute/skc/cl_20/extent.h delete mode 100644 src/compute/skc/cl_20/ring_cl_svm_fine.cpp delete mode 100644 src/compute/skc/cl_20/ring_cl_svm_fine.h delete mode 100644 src/compute/skc/composition_cl_12.c delete mode 100644 src/compute/skc/composition_cl_12.h delete mode 100644 src/compute/skc/config_cl.h delete mode 100644 src/compute/skc/cq_pool_cl.c delete mode 100644 src/compute/skc/cq_pool_cl.h delete mode 100644 src/compute/skc/device_cl_12.h delete mode 100644 src/compute/skc/device_cl_12_avx2.h delete mode 100644 src/compute/skc/device_cl_12_gen9.c delete mode 100644 src/compute/skc/device_cl_12_gen9.h delete mode 100644 src/compute/skc/export_cl_12.h delete mode 100644 src/compute/skc/extent_cl_12.c delete mode 100644 src/compute/skc/extent_cl_12.h delete mode 100644 src/compute/skc/extent_cl_12_unified.c delete mode 100644 src/compute/skc/fills_expand.cl delete mode 100644 src/compute/skc/handle_pool_cl_12.c delete mode 100644 src/compute/skc/handle_pool_cl_12.h delete mode 100644 src/compute/skc/interop.c delete mode 100644 src/compute/skc/interop.h delete mode 100644 src/compute/skc/make_all.bat delete mode 100644 
src/compute/skc/make_inl_cl.bat delete mode 100644 src/compute/skc/path_builder_cl_12.c delete mode 100644 src/compute/skc/path_builder_cl_12.h delete mode 100644 src/compute/skc/paths_copy.cl delete mode 100644 src/compute/skc/paths_reclaim.cl delete mode 100644 src/compute/skc/place.cl create mode 100644 src/compute/skc/platforms/cl_12/allocator_device_cl.c create mode 100644 src/compute/skc/platforms/cl_12/allocator_device_cl.h create mode 100644 src/compute/skc/platforms/cl_12/atomic_cl.h create mode 100644 src/compute/skc/platforms/cl_12/block_pool_cl.h create mode 100644 src/compute/skc/platforms/cl_12/block_pool_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/composition_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/composition_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/config_cl.h create mode 100644 src/compute/skc/platforms/cl_12/cq_pool_cl.c create mode 100644 src/compute/skc/platforms/cl_12/cq_pool_cl.h create mode 100644 src/compute/skc/platforms/cl_12/device_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/export_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/extent_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/extent_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/extent_cl_12_unified.c create mode 100644 src/compute/skc/platforms/cl_12/gl/interop.c create mode 100644 src/compute/skc/platforms/cl_12/gl/interop.h create mode 100644 src/compute/skc/platforms/cl_12/handle_pool_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/handle_pool_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat create mode 100644 src/compute/skc/platforms/cl_12/kernels/fills_expand.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/paths_copy.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/place.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/prefix.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/rasterize.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/render.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl create mode 100644 src/compute/skc/platforms/cl_12/path_builder_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/path_builder_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/raster_builder_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/raster_builder_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/runtime_cl.c create mode 100644 src/compute/skc/platforms/cl_12/runtime_cl.h create mode 100644 src/compute/skc/platforms/cl_12/runtime_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/runtime_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/styling_cl_12.c create mode 100644 
src/compute/skc/platforms/cl_12/styling_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/surface_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c delete mode 100644 src/compute/skc/prefix.cl delete mode 100644 src/compute/skc/raster_builder_cl_12.c delete mode 100644 src/compute/skc/raster_builder_cl_12.h delete mode 100644 src/compute/skc/rasterize.cl delete mode 100644 src/compute/skc/rasters_alloc.cl delete mode 100644 src/compute/skc/rasters_reclaim.cl delete mode 100644 src/compute/skc/render.cl delete mode 100644 src/compute/skc/runtime_cl.c delete mode 100644 src/compute/skc/runtime_cl.h delete mode 100644 src/compute/skc/runtime_cl_12.c delete mode 100644 src/compute/skc/runtime_cl_12.h delete mode 100644 src/compute/skc/segment_ttck.cl delete mode 100644 src/compute/skc/segment_ttrk.cl delete mode 100644 src/compute/skc/styling_cl_12.c delete mode 100644 src/compute/skc/styling_cl_12.h delete mode 100644 src/compute/skc/surface_cl_12.h delete mode 100644 src/compute/skc/surface_cl_12_buffer.c (limited to 'src/compute') diff --git a/src/compute/skc/Makefile b/src/compute/skc/Makefile deleted file mode 100644 index e6516e3fd1..0000000000 --- a/src/compute/skc/Makefile +++ /dev/null @@ -1,79 +0,0 @@ -# -# Copyright 2016 Google Inc. -# -# Use of this source code is governed by a BSD-style license that can -# be found in the LICENSE file. -# - -SRC = block_pool_init.cl paths_copy.cl fills_expand.cl rasterize.cl raster_alloc.cl prefix.cl place.cl render.cl - -PRE = $(SRC:%.cl=%.pre.cl) - -IR_GEN9 = $(PRE:%.cl=%.ir) - -$(info PRE : $(PRE)) -$(info IR_GEN9 : $(IR_GEN9)) - -# -# -# - -OPENCL_STD = -cl-std=CL1.2 -OPENCL_PRE = __OPENCL_C_VERSION__=120 - -# OPENCL_STD = -cl-std=CL2.0 -# OPENCL_PRE = __OPENCL_C_VERSION__=200 - -# -# -# - -TARGETS = $(PRE) $(IR_GEN9) - -# -# -# - -IOC = ioc64 - -IOC_IR_OPTS_OPT = $(OPENCL_STD) -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable \ - -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info - -IOC_IR_OPTS_DBG = $(OPENCL_STD) -cl-kernel-arg-info -g - -IOC_IR_OPTS = $(IOC_IR_OPTS_OPT) - -# -# -# - -PRE_DEPS = $(wildcard *.h) - -# -# -# - -all: $(TARGETS) - - -clean: - -rm -f $(TARGETS) $(wildcard *.pre.bin.inl) $(wildcard *.pre.src.inl) $(wildcard *.gen) $(wildcard *.TMP) - -# -# PREPROCESS -# - -$(PRE): %.pre.cl: %.cl $(PRE_DEPS) - cl -I . -I "%INTELOCLSDKROOT%\include" -D $(OPENCL_PRE) -EP $< -P -Fi"$@" - clang-format -i $@ - dos2unix $@ - xxd -i $@ $(basename $@).src.inl - -# -# GEN9 -- supports OpenCL 2.0 and can emit SPIR-V / SPIR-V TEXT but cannot load it via clCreateProgramWithIL() -# - -$(IR_GEN9): %.ir: %.cl - touch $@ - $(IOC) -cmd=build -bo="$(IOC_IR_OPTS)" -device=gpu -input=$< -ir=$@ -asm - xxd -i $@ $(basename $@).bin.inl diff --git a/src/compute/skc/allocator_device_cl.c b/src/compute/skc/allocator_device_cl.c deleted file mode 100644 index aa44f36e87..0000000000 --- a/src/compute/skc/allocator_device_cl.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include "runtime_cl_12.h" -#include "config_cl.h" -#include "common/cl/assert_cl.h" - -// -// PERM -// - -cl_mem -skc_runtime_device_perm_alloc(struct skc_runtime * const runtime, - cl_mem_flags const flags, - size_t const size) -{ - cl_int cl_err; - - cl_mem mem = clCreateBuffer(runtime->cl.context, - flags, - size, - NULL, - &cl_err); cl_ok(cl_err); - return mem; -} - -void -skc_runtime_device_perm_free(struct skc_runtime * const runtime, - cl_mem const mem) -{ - cl(ReleaseMemObject(mem)); -} - -// -// TEMP -// - -cl_mem -skc_runtime_device_temp_alloc(struct skc_runtime * const runtime, - cl_mem_flags const flags, - size_t const size, - skc_subbuf_id_t * const subbuf_id, - size_t * const subbuf_size) -{ - if (size == 0) - { - *subbuf_id = (skc_subbuf_id_t)-1; - - if (subbuf_size != NULL) - *subbuf_size = 0; - - return NULL; - } - - cl_buffer_region br; - - br.origin = skc_suballocator_subbuf_alloc(&runtime->allocator.device.temp.suballocator, - runtime->scheduler, - size,subbuf_id,&br.size); - - if (subbuf_size != NULL) - *subbuf_size = br.size; - - cl_int cl_err; - - cl_mem mem = clCreateSubBuffer(runtime->allocator.device.temp.extent, - flags, - CL_BUFFER_CREATE_TYPE_REGION, - &br, - &cl_err); cl_ok(cl_err); - - return mem; -} - - -void -skc_runtime_device_temp_free(struct skc_runtime * const runtime, - cl_mem const mem, - skc_subbuf_id_t const subbuf_id) -{ - if (mem == NULL) - return; - - skc_suballocator_subbuf_free(&runtime->allocator.device.temp.suballocator,subbuf_id); - - cl(ReleaseMemObject(mem)); -} - -// -// -// - -void -skc_allocator_device_create(struct skc_runtime * const runtime) -{ - skc_suballocator_create(runtime, - &runtime->allocator.device.temp.suballocator, - "DEVICE", - runtime->config->suballocator.device.subbufs, - runtime->cl.base_align, - runtime->config->suballocator.device.size); - -#ifndef NDEBUG -#pragma message("Get rid of CL_MEM_ALLOC_HOST_PTR as soon as the sorter is installed") - cl_mem_flags const flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; -#else - cl_mem_flags const flags = CL_MEM_READ_WRITE; -#endif - - runtime->allocator.device.temp.extent = - skc_runtime_device_perm_alloc(runtime, - flags, - runtime->config->suballocator.device.size); -} - -void -skc_allocator_device_dispose(struct skc_runtime * const runtime) -{ - skc_suballocator_dispose(runtime,&runtime->allocator.device.temp.suballocator); - - skc_runtime_device_perm_free(runtime,runtime->allocator.device.temp.extent); -} - -// -// -// - diff --git a/src/compute/skc/allocator_device_cl.h b/src/compute/skc/allocator_device_cl.h deleted file mode 100644 index 67d4e41398..0000000000 --- a/src/compute/skc/allocator_device_cl.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include - -// -// -// - -#include "suballocator.h" - -// -// -// - -struct skc_allocator_device -{ -#if 0 - struct { - - } perm; -#endif - - struct { - struct skc_suballocator suballocator; - cl_mem extent; - } temp; -}; - -// -// -// - -void -skc_allocator_device_create(struct skc_runtime * const runtime); - -void -skc_allocator_device_dispose(struct skc_runtime * const runtime); - -// -// -// - diff --git a/src/compute/skc/atomic_cl.h b/src/compute/skc/atomic_cl.h deleted file mode 100644 index c196c36390..0000000000 --- a/src/compute/skc/atomic_cl.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 Google Inc. 
- * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_ONCE_ATOMIC_CL -#define SKC_ONCE_ATOMIC_CL - -// -// git cl upload is bleating about needing an #include before and #if -// so we're unneccesarily reloading the types and OpenCL header -// - -#include "types.h" - -#if (__OPENCL_C_VERSION__ <= 120 /*CL_VERSION_1_2*/) - -#define SKC_ATOMIC_UINT uint -#define SKC_ATOMIC_INT int - -#define SKC_ATOMIC_ADD_LOCAL_RELAXED_DEVICE(p,v) atomic_add(p,v) -#define SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(p,v) atomic_add(p,v) - -#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(p,v) atomic_add(p,v) -#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(p,v) atomic_add(p,v) - -#else // __OPENCL_C_VERSION__ > __CL_VERSION_1_2 - -// -// REMOVE THESE DEFINES ASAP -- ONLY HERE BECAUSE THE INTEL CODE -// BUILDER UTILITY DOESN'T SUPPORT CREATING AN ATOMIC TYPE BUFFER -// - -#ifdef SKC_SUPPORT_BROKEN_INTEL_CODE_BUILDER - -#define SKC_ATOMIC_UINT uint -#define SKC_ATOMIC_CAST_LOCAL(p) (__local atomic_uint volatile * restrict const)(p) -#define SKC_ATOMIC_CAST_GLOBAL(p) (__global atomic_uint volatile * restrict const)(p) - -#else - -#define SKC_ATOMIC_UINT atomic_uint -#define SKC_ATOMIC_CAST_LOCAL(p) (p) -#define SKC_ATOMIC_CAST_GLOBAL(p) (p) - -#endif - - -#define SKC_ATOMIC_ADD_LOCAL_RELAXED_DEVICE(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_LOCAL(p), \ - v,memory_order_relaxed,memory_scope_device) -#define SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_LOCAL(p), \ - v,memory_order_relaxed,memory_scope_sub_group) - -#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_GLOBAL(p), \ - v,memory_order_relaxed,memory_scope_device) -#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_GLOBAL(p), \ - v,memory_order_relaxed,memory_scope_sub_group) - -#endif - -// -// -// - -#endif // SKC_ONCE_ATOMIC_CL - -// -// -// diff --git a/src/compute/skc/block_pool_cl.h b/src/compute/skc/block_pool_cl.h deleted file mode 100644 index c88370919e..0000000000 --- a/src/compute/skc/block_pool_cl.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_ONCE_BLOCK_POOL -#define SKC_ONCE_BLOCK_POOL - -// -// -// - -#include "types.h" - -// -// -// - -union skc_block_pool_size -{ - skc_uint3 u32v3; - - struct { - skc_uint pool_size; // number of blocks - skc_uint ring_pow2; // rounded-up pow2 of pool_size - skc_uint ring_mask; // ring_pow2 - 1 - }; -}; - -// -// -// - -union skc_block_pool_atomic -{ - skc_uint2 u32v2; - - skc_uint u32a2[2]; - - struct { - skc_uint reads; - skc_uint writes; - }; -}; - -#define SKC_BP_ATOMIC_OFFSET_READS 0 -#define SKC_BP_ATOMIC_OFFSET_WRITES 1 - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/block_pool_cl_12.h b/src/compute/skc/block_pool_cl_12.h deleted file mode 100644 index 6fa8a39ca0..0000000000 --- a/src/compute/skc/block_pool_cl_12.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include "block_pool_cl.h" -#include "extent_cl_12.h" - -// -// device side block pool -// - -struct skc_block_pool -{ - union skc_block_pool_size const * size; - - struct skc_extent_pdrw blocks; - struct skc_extent_pdrw ids; - struct skc_extent_phr_pdrw atomics; -}; - -// -// -// diff --git a/src/compute/skc/block_pool_init.cl b/src/compute/skc/block_pool_init.cl deleted file mode 100644 index 023dff44cf..0000000000 --- a/src/compute/skc/block_pool_init.cl +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" - -// -// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ -// - -__kernel -SKC_BP_INIT_IDS_KERNEL_ATTRIBS -void -skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size) -{ - uint const gid = get_global_id(0); - - // - // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to - // accomplish this with fewer threads and using either IPC and/or - // vector stores -- it should be on certain architectures! - // - - // - // initialize pool with sequence - // - if (gid < bp_size) - ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK; -} - -// -// -// - -__kernel -SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS -void -skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size) -{ - // the version test is to squelch a bug with the Intel OpenCL CPU - // compiler declaring it supports the cl_intel_subgroups extension -#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups) - uint const tid = get_sub_group_local_id(); -#else - uint const tid = get_local_id(0); -#endif - - // - // launch two threads and store [ 0, bp_size ] - // - bp_atomics[tid] = tid * bp_size; -} - -// -// -// diff --git a/src/compute/skc/cl_20/extent.c b/src/compute/skc/cl_20/extent.c deleted file mode 100644 index 4c073e8b69..0000000000 --- a/src/compute/skc/cl_20/extent.c +++ /dev/null @@ -1,787 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#include - -// #include "extent.h" - -// -// EXTENT TYPES -// -// Classification of operations on allocated GPU memory -// -// h = host -// d = device -// -// c = append using non-atomic incremented count -// x = append using atomically incremented index -// p = allocated from pool of indices -// g = gathered by pull kernel -// s = size is available -// -// w1 = write once -// wN = write many -// -// r1 = read once -// rN = read many -// -// rw = read/write many -// -// host<>device memory model -// +--------------------+--------------------+ -// extent type | split | shared | examples -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_atomic | device+mapped | device+mapped | atomically op'd device extent + read-only host snapshot -// | | | -// extent_dxrw | device | device | ttsk_array, ttpk_array, ttck_array, *_offsets -// extent_hcw1_dr1 | mapped | mapped | command_queue, buffer -// extent_hcrw | host | host | queue -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_hcw1_drN | memcpy'd | mapped | stack_transforms, stack_stroke_props -// extent_hgw1_drN | scatter/gather | mapped | layer_props -// | | | -// block_pool_dprw | device | device | ttsb_pool, ttpb_pool -// block_pool_hp_drw | device | device | raster_pool -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// block_pool_hp_drw | block_pool_hp_drw | block_pool_hp_drw | path_block_pool -// staging buffer | extent_hw_dr | -- | -// | | | -// - -// -// HIGH-LEVEL EXTENTS ARE BUILT FROM SIMPLER STRUCTURES -// - -// -// COUNTERS FOR POOLS -- TYPICALLY ATOMIC WHEN ON DEVICE -// - -union skc_ring -{ - skc_uint2 u32v2; - - skc_uint u32a2[2]; - - struct { - skc_uint reads; // number of reads - skc_uint writes; // number of writes - }; -}; - -// -// POOL OF INDICES TO BLOCKS -// - -struct skc_pool_h -{ - skc_uint * indices; -}; - -struct skc_pool_d -{ - cl_mem * indices; // FIXME -- READ POOL INDICES THROUGH CONSTANT CACHE? 
-}; - -// -// LOW-LEVEL EXTENTS -- SIZES ARE STORED ELSEWHERE -// - -struct skc_extent_hrw -{ - void * hrw; // host pointer to host extent -- read/write -}; - -struct skc_extent_drw -{ - cl_mem drw; // device pointer to device extent -- read/write -}; - -struct skc_extent_hw_dr -{ - void * hw; // host pointer to shared extent -- write-only + write-combined - cl_mem dr; // device pointer to shared extent -- read-only -}; - -// -// -// - -#if 0 -static -void * -skc_runtime_svm_alloc(struct skc_runtime_cl * const runtime_cl, size_t const size) -{ - return clSVMAlloc(runtime_cl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, - size, - 0); -} - -static -void * -skc_runtime_svm_atomic_alloc(struct skc_runtime_cl * const runtime_cl, size_t const size) // WE DON'T NEED THIS HERE -{ - return clSVMAlloc(runtime_cl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, - size, - 0); -} - -static -void -skc_runtime_svm_free(struct skc_runtime_cl * const runtime_cl, void * const buffer) -{ - clSVMFree(runtime_cl->context,buffer); -} -#endif - -// -// -// - -void -skc_command_queue_fill_device(struct skc_command_queue * const cq, - cl_mem buffer, - void const * const pattern, - size_t const pattern_size, - size_t const size); - -void * -skc_command_queue_map_wi(struct skc_command_queue * const cq, - cl_mem buffer); - -void -skc_command_queue_unmap(struct skc_command_queue * const cq, - cl_mem buffer, - void * const mapped); - -void -skc_command_queue_read(struct skc_command_queue * const cq, - cl_mem buffer, - void * const ptr); - -// -// -// - -struct skc_extent_hrw * -skc_extent_hrw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hrw = skc_allocator_alloc_host(allocator,size); - - return extent; -} - - - -void -skc_extent_hrw_free(struct skc_allocator * const allocator, - struct skc_extent_hrw * const extent) -{ - skc_allocator_free_host(allocator,extent->hrw); - skc_allocator_free_host(allocator,extent); -} - -// -// -// - -struct skc_extent_drw * -skc_extent_drw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_drw * extent; - - extent = skc_allocator_alloc_host (allocator,sizeof(*extent)); - extent->drw = skc_allocator_alloc_device(allocator,size); - - return extent; -} - -void -skc_extent_drw_free(struct skc_allocator * const allocator, - struct skc_extent_drw * const extent) -{ - skc_allocator_free_device(allocator,extent->drw); - skc_allocator_free_host (allocator,extent); -} - -void -skc_extent_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size) -{ - skc_command_queue_fill_device(cq,extent->drw,pattern,pattern_size,size); -} - -// -// WRITE-COMBINED / WRITE-INVALIDATE -// - -struct skc_extent_hw_dr * -skc_extent_hw_dr_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hw_dr * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw = NULL; - extent->dr = skc_allocator_alloc_device_wc(allocator,size); // write-combined mem - - return extent; -} - -void -skc_extent_hw_dr_free(struct skc_allocator * const allocator, - struct skc_extent_hw_dr * const extent) -{ - skc_allocator_free_device(allocator,extent->dr); - skc_allocator_free_host (allocator,extent); -} - -void -skc_extent_hw_dr_map(struct 
skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent) -{ - extent->hw = skc_command_queue_map_wi(cq,extent->dr); -} - -void -skc_extent_hw_dr_unmap(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent) -{ - skc_command_queue_unmap(cq,extent->dr,extent->hw); -} - -void -skc_extent_hw_dr_memcpy(struct skc_extent_hw_dr * const extent, - void const * SKC_RESTRICT const src, - size_t const offset, - size_t const size) -{ - void * SKC_RESTRICT const dst = (char *)extent->hw + offset; - - memcpy(dst,src,size); -} -// -// SNAPSHOT -// - -struct skc_extent_hr_drw -{ - void * hr; // host pointer to shared extent -- readable snapshot - cl_mem drw; // device pointer to shared extent -- read/write -}; - -struct skc_extent_hr_drw * -skc_extent_hr_drw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hr_drw * extent; - - extent = skc_allocator_alloc_host (allocator,sizeof(*extent)); - extent->hr = skc_allocator_alloc_host (allocator,size); - extent->drw = skc_allocator_alloc_device(allocator,size); - - return extent; -} - -void -skc_extent_hr_drw_free(struct skc_allocator * const allocator, - struct skc_extent_hr_drw * const extent) -{ - skc_allocator_free_host (allocator,extent->hr); - skc_allocator_free_device(allocator,extent->drw); - skc_allocator_free_host (allocator,extent); -} - -void -skc_extent_hr_drw_snap(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - size_t const size) -{ - skc_command_queue_read(cq,extent->drw,extent->hr); -} - -void -skc_extent_hr_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size) -{ - skc_command_queue_fill_device(cq,extent->drw,pattern,pattern_size,size); -} - -// -// -// - -struct skc_extent_atomic -{ - struct skc_extent_hr_drw * hr_drw; - size_t size; // typically a very small extent -}; - -// -// -// - -struct skc_extent_atomic * -skc_extent_atomic_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_atomic * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hr_drw = skc_extent_hr_drw_alloc(allocator,size); - extent->size = size; - - return extent; -} - -void -skc_extent_atomic_free(struct skc_allocator * const allocator, - struct skc_extent_atomic * const extent) -{ - skc_extent_hr_drw_free (allocator,extent->hr_drw); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_atomic_snap(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent) -{ - skc_extent_hr_drw_snap(cq,extent->hr_drw,extent->size); -} - -void -skc_extent_atomic_zero(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent) -{ - skc_uint const zero = 0; - - skc_extent_hr_drw_fill(cq,extent->hr_drw,&zero,sizeof(zero),extent->size); -} - -// -// -// - -struct skc_extent_dxrw -{ - struct skc_extent_drw * drw; - - size_t elem_size; - skc_uint elem_count; - -#if 0 // SKC_EXTENT_ATOMIC_IS_IGNORED - struct skc_extent_atomic * atomic; - size_t atomic_offset; -#endif -}; - -// -// -// - -struct skc_extent_dxrw * -skc_extent_dxrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count, - struct skc_extent_atomic * const atomic, - size_t const atomic_offset) -{ - struct skc_extent_dxrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->drw = 
skc_extent_drw_alloc(allocator,elem_size * elem_count); - - extent->elem_size = elem_size; - extent->elem_count = elem_count; - - // - // note that passing in the atomic and its member has no real use at - // this point since the current programming style requires passing - // in the atomic extent -- which may have multiple members -- to the - // compute kernel - // -#if 0 // SKC_EXTENT_ATOMIC_IS_IGNORED - extent->atomic = atomic; - extent->atomic_offset = atomic_offset; -#endif - - return extent; -} - -void -skc_extent_dxrw_free(struct skc_allocator * const allocator, - struct skc_extent_dxrw * const extent) -{ - skc_extent_drw_free (allocator,extent->drw); - skc_allocator_free_host(allocator,extent); -} - -// -// -// - -struct skc_extent_hcrw -{ - struct skc_extent_hrw * hrw; - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcrw * -skc_extent_hcrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hrw = skc_extent_hrw_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcrw_free(struct skc_allocator * const allocator, - struct skc_extent_hcrw * const extent) -{ - skc_extent_hrw_free (allocator,extent->hrw); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcrw_reset(struct skc_extent_hcrw * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcrw_is_full(struct skc_extent_hcrw const * const extent) -{ - return (extent->counter == extent->elem_count); -} - -// -// -// - -struct skc_extent_hcw1_dr1 -{ - struct skc_extent_hw_dr * hw_dr; // mapped memory - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcw1_dr1 * -skc_extent_hcw1_dr1_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcw1_dr1 * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw_dr = skc_extent_hw_dr_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcw1_dr1_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_free (allocator,extent->hw_dr); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcw1_dr1_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_map(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_dr1_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_unmap(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_dr1_reset(struct skc_extent_hcw1_dr1 * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcw1_dr1_is_full(struct skc_extent_hcw1_dr1 const * const extent) -{ - return (extent->counter == extent->elem_count); -} - -skc_uint -skc_extent_hcw1_dr1_rem(struct skc_extent_hcw1_dr1 * const extent) -{ - return extent->elem_count - extent->counter; -} - -void -skc_extent_hcw1_dr1_append(struct skc_extent_hcw1_dr1 * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped) -{ - skc_extent_hw_dr_memcpy(extent->hw_dr, - elem_ptr, - 
extent->elem_size * extent->counter, - extent->elem_size * elem_count_clamped); -} - -// -// -// - -struct skc_extent_hcw1_drN_unified -{ - struct skc_extent_hw_dr * hw_dr; // mapped memory - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcw1_drN_unified * -skc_extent_hcw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcw1_drN_unified * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw_dr = skc_extent_hw_dr_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_drN_unified * const extent) -{ - skc_extent_hw_dr_free (allocator,extent->hw_dr); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcw1_drN_unified_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent) -{ - skc_extent_hw_dr_map(cq,extent->hw_dr); -} - - -void -skc_extent_hcw1_drN_unified_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent) -{ - skc_extent_hw_dr_unmap(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_drN_unified_reset(struct skc_extent_hcw1_drN_unified * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcw1_drN_unified_is_full(struct skc_extent_hcw1_drN_unified const * const extent) -{ - return (extent->counter == extent->elem_count); -} - - -skc_uint -skc_extent_hcw1_drN_unified_rem(struct skc_extent_hcw1_drN_unified * const extent) -{ - return extent->elem_count - extent->counter; -} - - -void -skc_extent_hcw1_drN_unified_append(struct skc_extent_hcw1_drN_unified * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped) -{ - skc_extent_hw_dr_memcpy(extent->hw_dr, - elem_ptr, - extent->elem_size * extent->counter, - extent->elem_size * elem_count_clamped); -} - -// -// -// - -struct skc_id_pool_hp * -skc_id_pool_hp_alloc(struct skc_allocator * const allocator, - skc_uint const count) -{ - return NULL; -} - -void -skc_id_pool_hp_free(struct skc_allocator * const allocator, - struct skc_id_pool_hp * const extent) -{ - ; -} - -void -skc_id_pool_hp_acquire(struct skc_id_pool_hp * const extent, - skc_uint * const id) -{ - ; -} - -void -skc_id_pool_hp_release_1(struct skc_id_pool_hp * const extent, - skc_uint const id) -{ - ; -} - -void -skc_id_pool_hp_release_n(struct skc_id_pool_hp * const extent, - skc_uint const * const id, - skc_uint const count) -{ - ; -} - -// -// -// - -struct skc_block_pool_dprw * -skc_block_pool_dprw_alloc(struct skc_allocator * const allocator, - union skc_ring * const ring_d, - skc_uint const block_size, - skc_uint const block_count) -{ - return NULL; -} - -void -skc_block_pool_dprw_free(struct skc_allocator * const allocator, - struct skc_block_pool_dprw * const extent) -{ - ; -} - -// -// -// - -struct skc_extent_hgw1_drN * -skc_extent_hgw1_drN_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - return NULL; -} - -void -skc_extent_hgw1_drN_free(struct skc_allocator * const allocator, - struct skc_extent_hgw1_drN * const extent) -{ - ; -} - -void -skc_extent_hgw1_drN_reset(struct skc_extent_hgw1_drN * const extent) -{ - ; -} - -void -skc_extent_hgw1_drN_snap(struct skc_command_queue * const cq, - struct 
skc_extent_hgw1_drN const * const extent) -{ - ; -} - -// -// -// - -#if 0 - -// -// -// - -struct skc_block_pool_hp_drw * -skc_block_pool_hp_drw_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - return NULL; -} - -void -skc_block_pool_hp_drw_free(struct skc_allocator * const allocator, - struct skc_block_pool_hp_drw * const extent) -{ - ; -} - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/cl_20/extent.h b/src/compute/skc/cl_20/extent.h deleted file mode 100644 index 2993968a50..0000000000 --- a/src/compute/skc/cl_20/extent.h +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include "skc.h" -#include "allocator.h" - -// -// EXTENT TYPES -// -// Classification of operations on allocated GPU memory -// -// h = host -// d = device -// -// c = append using non-atomic incremented count -// x = append using atomically incremented index -// p = allocated from pool of indices -// g = gathered by pull kernel -// -// w1 = write once -// wN = write many -// -// r1 = read once -// rN = read many -// -// rw = read/write many -// -// host<>device memory model -// +--------------------+--------------------+ -// extent type | split | shared | examples -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_atomic | device+mapped | device+mapped | atomically op'd device extent + read-only host snapshot -// | | | -// extent_dxrw | device | device | ttsk_array, ttpk_array, ttck_array, *_offsets -// extent_hcw1_dr1 | mapped | mapped | command_queue, buffer -// extent_hcrw | host | host | queue -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_hcw1_drN | memcpy'd | mapped | stack_transforms, stack_stroke_props -// extent_hgw1_drN | scatter/gather | mapped | layer_props -// | | | -// block_pool_dprw | device | device | ttsb_pool, ttpb_pool -// block_pool_hp_drw | device | device | raster_pool -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// block_pool_hp_drw | block_pool_hp_drw | block_pool_hp_drw | path_block_pool -// staging buffer | extent_hw_dr | -- | -// | | | -// - -struct skc_extent_hrw; -struct skc_extent_drw; - -struct skc_extent_hrw_drN; -struct skc_extent_hw1_drN; -struct skc_extent_hrN_drw; - -struct skc_extent_atomic; - -struct skc_extent_hcrw; -struct skc_extent_dxrw; - -struct skc_block_pool_dprw; - -struct skc_id_pool_hp; - -struct skc_extent_hcw1_dr1; -struct skc_extent_hcw1_drN; -struct skc_extent_hgw1_drN; - -// -// -// - -void * -skc_extent_hrw_drN_get_hrw(struct skc_extent_hrw_drN * extent); - -void * -skc_extent_hw1_drN_get_hw1(struct skc_extent_hw1_drN * extent); - -// -// -// - -struct skc_extent_hrw * -skc_extent_hrw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hrw_free(struct skc_allocator * const allocator, - struct skc_extent_hrw * const extent); - -void * -skc_extent_hrw_get_hrw(struct skc_extent_hrw * extent); - -// -// -// - -struct skc_extent_drw * -skc_extent_drw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_drw_free(struct skc_allocator * const allocator, - struct skc_extent_drw * const extent); - -void -skc_extent_drw_fill(struct skc_command_queue * const cq, - 
struct skc_extent_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size); - -// -// -// - -struct skc_extent_hw_dr * -skc_extent_hw_dr_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hw_dr_free(struct skc_allocator * const allocator, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_map(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_unmap(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_memcpy(struct skc_extent_hw_dr * const extent, - void const * SKC_RESTRICT const src, - size_t const offset, - size_t const size); -// -// -// - -struct skc_extent_hr_drw * -skc_extent_hr_drw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hr_drw_free(struct skc_allocator * const allocator, - struct skc_extent_hr_drw * const extent); - -void -skc_extent_hr_drw_snap(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - size_t const size); - -void -skc_extent_hr_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size); - -// -// -// - -struct skc_extent_atomic * -skc_extent_atomic_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_atomic_free(struct skc_allocator * const allocator, - struct skc_extent_atomic * const extent); - -void -skc_extent_atomic_snap(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent); - -void -skc_extent_atomic_zero(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent); - -// -// -// - - -struct skc_extent_dxrw * -skc_extent_dxrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count, - struct skc_extent_atomic * const atomic, - size_t const atomic_offset); - -void -skc_extent_dxrw_free(struct skc_allocator * const allocator, - struct skc_extent_dxrw * const extent); - -// -// -// - -struct skc_extent_hcrw * -skc_extent_hcrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcrw_free(struct skc_allocator * const allocator, - struct skc_extent_hcrw * const extent); - -void -skc_extent_hcrw_reset(struct skc_extent_hcrw * const extent); - -skc_bool -skc_extent_hcrw_is_full(struct skc_extent_hcrw const * const extent); - -// -// -// - -struct skc_extent_hcw1_dr1 * -skc_extent_hcw1_dr1_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcw1_dr1_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_reset(struct skc_extent_hcw1_dr1 * const extent); - -skc_bool -skc_extent_hcw1_dr1_is_full(struct skc_extent_hcw1_dr1 const * const extent); - -skc_uint -skc_extent_hcw1_dr1_rem(struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_append(struct skc_extent_hcw1_dr1 * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped); - -// -// Note: on a shared memory device 
this reuses the hcw1_dr1 -// implementation and unmaps the extent instead of copying -// - -struct skc_extent_hcw1_drN_unified * -skc_extent_hcw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_reset(struct skc_extent_hcw1_drN_unified * const extent); - -skc_bool -skc_extent_hcw1_drN_unified_is_full(struct skc_extent_hcw1_drN_unified const * const extent); - -skc_uint -skc_extent_hcw1_drN_unified_rem(struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_append(struct skc_extent_hcw1_drN_unified * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped); -// -// -// - -struct skc_id_pool_hp * -skc_id_pool_hp_alloc(struct skc_allocator * const allocator, - skc_uint const count); - -void -skc_id_pool_hp_free(struct skc_allocator * const allocator, - struct skc_id_pool_hp * const extent); - -void -skc_id_pool_hp_acquire(struct skc_id_pool_hp * const extent, - skc_uint * const id); - -void -skc_id_pool_hp_release_1(struct skc_id_pool_hp * const extent, - skc_uint const id); - -void -skc_id_pool_hp_release_n(struct skc_id_pool_hp * const extent, - skc_uint const * const id, - skc_uint const count); - -// -// -// - -struct skc_block_pool_dprw * -skc_block_pool_dprw_alloc(struct skc_allocator * const allocator, - union skc_ring * const ring_d, - skc_uint const block_size, - skc_uint const block_count); - -void -skc_block_pool_dprw_free(struct skc_allocator * const allocator, - struct skc_block_pool_dprw * const extent); - -// -// -// - -struct skc_extent_hgw1_drN_unified * -skc_extent_hgw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hgw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hgw1_drN_unified * const extent); - -void -skc_extent_hgw1_drN_unified_reset(struct skc_extent_hgw1_drN_unified * const extent); - -void -skc_extent_hgw1_drN_unified_snap(struct skc_command_queue * const cq, - struct skc_extent_hgw1_drN_unified const * const extent); - -// -// -// - -#if 0 - -// -// -// - -struct skc_block_pool_hp_drw * -skc_block_pool_hp_drw_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_block_pool_hp_drw_free(struct skc_allocator * const allocator, - struct skc_block_pool_hp_drw * const extent); - -// -// -// - -#endif - -// -// -// - diff --git a/src/compute/skc/cl_20/ring_cl_svm_fine.cpp b/src/compute/skc/cl_20/ring_cl_svm_fine.cpp deleted file mode 100644 index 9552c81f2d..0000000000 --- a/src/compute/skc/cl_20/ring_cl_svm_fine.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// Fine-grained shared virtual memory ring -// -// There is limited support for C11 atomics in C compilers so -// implement this module in C++11 -// - -extern "C" { - -#include "runtime.h" -#include "ring_cl_svm_fine.h" - -} - -// -// -// - -#include - -// -// -// - -union skc_ring -{ - std::atomic rw[2]; - - struct { - std::atomic reads; // number of reads - std::atomic writes; // number of writes - }; -}; - -// -// -// - -union skc_ring * -skc_ring_cl_svm_fine_alloc(struct skc_runtime_impl * const runtime_impl) -{ - return (union skc_ring *) - clSVMAlloc(runtime_impl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, - sizeof(union skc_ring), - 0); -} - -void -skc_ring_cl_svm_fine_init(union skc_ring * const ring, skc_uint writes) -{ - ring->reads = ATOMIC_VAR_INIT(0); - ring->writes = ATOMIC_VAR_INIT(writes); -} - -void -skc_ring_cl_svm_fine_free(struct skc_runtime_impl * const runtime_impl, union skc_ring * const ring) -{ - clSVMFree(runtime_impl->context,ring); -} - -// -// -// - -skc_uint -skc_ring_cl_svm_fine_read(union skc_ring * const ring, skc_uint const n) -{ - return atomic_fetch_add_explicit(&ring->reads,n,std::memory_order_relaxed); -} - -skc_uint -skc_ring_cl_svm_fine_write(union skc_ring * const ring, skc_uint const n) -{ - return atomic_fetch_add_explicit(&ring->writes,n,std::memory_order_relaxed); -} - -// -// -// - diff --git a/src/compute/skc/cl_20/ring_cl_svm_fine.h b/src/compute/skc/cl_20/ring_cl_svm_fine.h deleted file mode 100644 index 65ff9f71f3..0000000000 --- a/src/compute/skc/cl_20/ring_cl_svm_fine.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// Fine-grained shared virtual memory ring -// - -#include "runtime.h" -#include "types.h" - -// -// -// - -union skc_ring * -skc_ring_cl_svm_fine_alloc(struct skc_runtime_impl * const runtime_impl); - -void -skc_ring_cl_svm_fine_free(struct skc_runtime_impl * const runtime_impl, union skc_ring * const ring); - -// -// -// - -void -skc_ring_cl_svm_fine_init(union skc_ring * const ring, skc_uint writes); - -// -// -// - -skc_uint -skc_ring_cl_svm_fine_read(union skc_ring * const ring, skc_uint const n); - -skc_uint -skc_ring_cl_svm_fine_write(union skc_ring * const ring, skc_uint const n); - -// -// -// - diff --git a/src/compute/skc/common.h b/src/compute/skc/common.h index 618ba2242e..5ac42ab2dc 100644 --- a/src/compute/skc/common.h +++ b/src/compute/skc/common.h @@ -9,6 +9,8 @@ #ifndef SKC_COMMON_ONCE #define SKC_COMMON_ONCE +#include "types.h" + // // structures common to both host and device -- placeholder until // everything shakes out diff --git a/src/compute/skc/composition_cl_12.c b/src/compute/skc/composition_cl_12.c deleted file mode 100644 index 7853564636..0000000000 --- a/src/compute/skc/composition_cl_12.c +++ /dev/null @@ -1,823 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include -#include - -#include "hs/cl/hs_cl_launcher.h" - -#include "common/cl/assert_cl.h" - -#include "composition_cl_12.h" -#include "config_cl.h" - -#include "context.h" -#include "raster.h" -#include "handle.h" - -#include "runtime_cl_12.h" - -#include "common.h" -#include "tile.h" - -// -// TTCK (32-BIT COMPARE) v1: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 18 | 7 | 7 | -// -// -// TTCK (32-BIT COMPARE) v2: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 15 | 9 | 8 | -// -// -// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 27 | 1 | 1 | 18 | 9 | 8 | -// - -union skc_ttck -{ - skc_ulong u64; - skc_uint2 u32v2; - - struct { - skc_uint id : SKC_TTCK_LO_BITS_ID; - skc_uint prefix : SKC_TTCK_LO_BITS_PREFIX; - skc_uint escape : SKC_TTCK_LO_BITS_ESCAPE; - skc_uint layer_lo : SKC_TTCK_LO_BITS_LAYER; - skc_uint layer_hi : SKC_TTCK_HI_BITS_LAYER; - skc_uint x : SKC_TTCK_HI_BITS_X; - skc_uint y : SKC_TTCK_HI_BITS_Y; - }; - - struct { - skc_ulong na0 : SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE; - skc_ulong layer : SKC_TTCK_BITS_LAYER; - skc_ulong na1 : SKC_TTCK_HI_BITS_YX; - }; - - struct { - skc_uint na2; - skc_uint na3 : SKC_TTCK_HI_BITS_LAYER; - skc_uint yx : SKC_TTCK_HI_BITS_YX; - }; -}; - -// -// FIXME -- accept floats on host but convert to subpixel offsets -// before appending to command ring -// - -#define SKC_PLACE_CMD_TX_CONVERT(f) 0 -#define SKC_PLACE_CMD_TY_CONVERT(f) 0 - -// -// COMPOSITION PLACE -// -// This is a snapshot of the host-side command queue. -// -// Note that the composition command extent could be implemented as -// either a mapped buffer or simply copied to an ephemeral extent. -// -// This implementation may vary between compute platforms. 
-// - -struct skc_composition_place -{ - struct skc_composition_impl * impl; - - cl_command_queue cq; - - struct skc_extent_phw1g_tdrNs_snap cmds; - - skc_subbuf_id_t id; -}; - -// -// Forward declarations -// - -static -void -skc_composition_unseal_block(struct skc_composition_impl * const impl, - skc_bool const block); - -// -// -// - -static -void -skc_composition_pfn_release(struct skc_composition_impl * const impl) -{ - if (--impl->composition->ref_count != 0) - return; - - // - // otherwise, dispose of all resources - // - - // the unsealed state is a safe state to dispose of resources - skc_composition_unseal_block(impl,true); // block - - struct skc_runtime * const runtime = impl->runtime; - - // free host composition - skc_runtime_host_perm_free(runtime,impl->composition); - - // release the cq - skc_runtime_release_cq_in_order(runtime,impl->cq); - - // release kernels - cl(ReleaseKernel(impl->kernels.place)); - cl(ReleaseKernel(impl->kernels.segment)); - - // release extents - skc_extent_phw1g_tdrNs_free(runtime,&impl->cmds.extent); - skc_extent_phrw_free (runtime,&impl->saved.extent); - skc_extent_phr_pdrw_free (runtime,&impl->atomics); - - skc_extent_pdrw_free (runtime,&impl->keys); - skc_extent_pdrw_free (runtime,&impl->offsets); - - // free composition impl - skc_runtime_host_perm_free(runtime,impl); -} - -// -// -// - -static -void -skc_composition_place_grid_pfn_dispose(skc_grid_t const grid) -{ - struct skc_composition_place * const place = skc_grid_get_data(grid); - struct skc_composition_impl * const impl = place->impl; - struct skc_runtime * const runtime = impl->runtime; - - // release cq - skc_runtime_release_cq_in_order(runtime,place->cq); - - // unmap the snapshot (could be a copy) - skc_extent_phw1g_tdrNs_snap_free(runtime,&place->cmds); - - // release place struct - skc_runtime_host_temp_free(runtime,place,place->id); - - // release impl - skc_composition_pfn_release(impl); -} - -// -// -// - -static -void -skc_composition_place_read_complete(skc_grid_t const grid) -{ - skc_grid_complete(grid); -} - -static -void -skc_composition_place_read_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_composition_place * const place = skc_grid_get_data(grid); - struct skc_composition_impl * const impl = place->impl; - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(scheduler,skc_composition_place_read_complete,grid); -} - -static -void -skc_composition_place_grid_pfn_execute(skc_grid_t const grid) -{ - // - // FILLS EXPAND - // - // need result of cmd counts before launching RASTERIZE grids - // - // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host - // - OpenCL 2.x: have a kernel size and launch RASTERIZE grids from device - // - or launch a device-wide grid that feeds itself but that's unsatisfying - // - struct skc_composition_place * const place = skc_grid_get_data(grid); - struct skc_composition_impl * const impl = place->impl; - struct skc_runtime * const runtime = impl->runtime; - - skc_uint const work_size = skc_extent_ring_snap_count(place->cmds.snap); - skc_uint4 const clip = { 0, 0, SKC_UINT_MAX, SKC_UINT_MAX }; - - // initialize kernel args - cl(SetKernelArg(impl->kernels.place,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.place,1,SKC_CL_ARG(impl->atomics.drw))); - 
cl(SetKernelArg(impl->kernels.place,2,SKC_CL_ARG(impl->keys.drw))); - cl(SetKernelArg(impl->kernels.place,3,SKC_CL_ARG(place->cmds.drN))); - cl(SetKernelArg(impl->kernels.place,4,SKC_CL_ARG(runtime->handle_pool.map.drw))); - cl(SetKernelArg(impl->kernels.place,5,SKC_CL_ARG(clip))); // FIXME -- convert the clip to yx0/yx1 format - cl(SetKernelArg(impl->kernels.place,6,SKC_CL_ARG(work_size))); - - // launch kernel - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_PLACE, - place->cq, - impl->kernels.place, - work_size, - 0,NULL,NULL); - // - // copy atomics back after every place launch - // - cl_event complete; - - skc_extent_phr_pdrw_read(&impl->atomics,place->cq,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_composition_place_read_cb,grid)); - cl(ReleaseEvent(complete)); - - // flush command queue - cl(Flush(place->cq)); -} - -// -// -// - -static -void -skc_composition_snap(struct skc_composition_impl * const impl) -{ - skc_composition_retain(impl->composition); - - skc_subbuf_id_t id; - - struct skc_composition_place * const place = skc_runtime_host_temp_alloc(impl->runtime, - SKC_MEM_FLAGS_READ_WRITE, - sizeof(*place),&id,NULL); - - // save the subbuf id - place->id = id; - - // save backpointer - place->impl = impl; - - // set grid data - skc_grid_set_data(impl->grids.place,place); - - // acquire command queue - place->cq = skc_runtime_acquire_cq_in_order(impl->runtime); - - // checkpoint the ring - skc_extent_ring_checkpoint(&impl->cmds.ring); - - // make a snapshot - skc_extent_phw1g_tdrNs_snap_init(impl->runtime,&impl->cmds.ring,&place->cmds); - - // unmap the snapshot (could be a copy) - skc_extent_phw1g_tdrNs_snap_alloc(impl->runtime, - &impl->cmds.extent, - &place->cmds, - place->cq, - NULL); - - skc_grid_force(impl->grids.place); -} - -// -// -// - -static -void -skc_composition_pfn_seal(struct skc_composition_impl * const impl) -{ - // return if sealing or sealed - if (impl->state >= SKC_COMPOSITION_STATE_SEALING) - return; - - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // - // otherwise, wait for UNSEALING > UNSEALED transition - // - if (impl->state == SKC_COMPOSITION_STATE_UNSEALING) - { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_UNSEALED); - } - else // or we were already unsealed - { - // flush is there is work in progress - skc_uint const count = skc_extent_ring_wip_count(&impl->cmds.ring); - - if (count > 0) { - skc_composition_snap(impl); - } - } - - // - // now unsealed so we need to start sealing... 
- // - impl->state = SKC_COMPOSITION_STATE_SEALING; - - // - // the seal operation implies we should force start all dependencies - // that are still in a ready state - // - skc_grid_force(impl->grids.sort); -} - -// -// -// - -void -skc_composition_sort_execute_complete(struct skc_composition_impl * const impl) -{ - // we're sealed - impl->state = SKC_COMPOSITION_STATE_SEALED; - - // this grid is done - skc_grid_complete(impl->grids.sort); -} - -static -void -skc_composition_sort_execute_cb(cl_event event, cl_int status, struct skc_composition_impl * const impl) -{ - SKC_CL_CB(status); - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(impl->runtime->scheduler,skc_composition_sort_execute_complete,impl); -} - -static -void -skc_composition_sort_grid_pfn_execute(skc_grid_t const grid) -{ - struct skc_composition_impl * const impl = skc_grid_get_data(grid); - - // we should be sealing - assert(impl->state == SKC_COMPOSITION_STATE_SEALING); - - struct skc_place_atomics * const atomics = impl->atomics.hr; - -#ifndef NDEBUG - fprintf(stderr,"composition sort: %u\n",atomics->keys); -#endif - - if (atomics->keys > 0) - { - uint32_t keys_padded_in, keys_padded_out; - - hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); - - hs_sort(impl->cq, - impl->keys.drw, - impl->keys.drw, - atomics->keys, - keys_padded_in, - keys_padded_out, - false); - - cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(impl->keys.drw))); - cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(impl->offsets.drw))); - cl(SetKernelArg(impl->kernels.segment,2,SKC_CL_ARG(impl->atomics.drw))); - - // find start of each tile - skc_device_enqueue_kernel(impl->runtime->device, - SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, - impl->cq, - impl->kernels.segment, - atomics->keys, - 0,NULL,NULL); - } - - cl_event complete; - - // next stage needs to know number of key segments - skc_extent_phr_pdrw_read(&impl->atomics,impl->cq,&complete); - - // register a callback - cl(SetEventCallback(complete,CL_COMPLETE,skc_composition_sort_execute_cb,impl)); - cl(ReleaseEvent(complete)); - - // flush cq - cl(Flush(impl->cq)); -} - -// -// -// - -static -void -skc_composition_raster_release(struct skc_composition_impl * const impl) -{ - // - // reference counts to rasters can only be released when the - // composition is unsealed and the atomics are reset. - // - skc_runtime_raster_device_release(impl->runtime, - impl->saved.extent.hrw, - impl->saved.count); - // reset count - impl->saved.count = 0; -} - -// -// -// - -static -void -skc_composition_unseal_block(struct skc_composition_impl * const impl, - skc_bool const block) -{ - // return if already unsealed - if (impl->state == SKC_COMPOSITION_STATE_UNSEALED) - return; - - // - // otherwise, we're going to need to pump the scheduler - // - struct skc_scheduler * const scheduler = impl->runtime->scheduler; - - // - // wait for UNSEALING > UNSEALED transition - // - if (impl->state == SKC_COMPOSITION_STATE_UNSEALING) - { - if (block) { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_UNSEALED); - } - return; - } - - // - // wait for SEALING > SEALED transition ... 
- // - if (impl->state == SKC_COMPOSITION_STATE_SEALING) - { - // wait if sealing - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_SEALED); - } - - // wait for rendering locks to be released - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0); - - // - // no need to visit UNSEALING state with this implementation - // - - // acquire a new grid - impl->grids.sort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - NULL, // the composition state guards this - impl, - NULL, // no waiting - skc_composition_sort_grid_pfn_execute, - NULL); // no dispose - - // mark composition as unsealed - impl->state = SKC_COMPOSITION_STATE_UNSEALED; -} - -// -// can only be called on a composition that was just unsealed -// -static -void -skc_composition_reset(struct skc_composition_impl * const impl) -{ - // zero the atomics - skc_extent_phr_pdrw_zero(&impl->atomics,impl->cq,NULL); - - // flush it - cl(Flush(impl->cq)); - - // release all the rasters - skc_composition_raster_release(impl); -} - -static -void -skc_composition_unseal_block_reset(struct skc_composition_impl * const impl, - skc_bool const block, - skc_bool const reset) -{ - skc_composition_unseal_block(impl,block); - - if (reset) { - skc_composition_reset(impl); - } -} - -// -// -// - -static -void -skc_composition_pfn_unseal(struct skc_composition_impl * const impl, skc_bool const reset) -{ - skc_composition_unseal_block_reset(impl,false,reset); -} - -// -// only needs to create a grid -// - -static -void -skc_composition_place_create(struct skc_composition_impl * const impl) -{ - // acquire a grid - impl->grids.place = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - &impl->grids.place, - NULL, - NULL, // no waiting - skc_composition_place_grid_pfn_execute, - skc_composition_place_grid_pfn_dispose); - - // assign happens-after relationship - skc_grid_happens_after_grid(impl->grids.sort,impl->grids.place); -} - - -static -skc_err -skc_composition_pfn_place(struct skc_composition_impl * const impl, - skc_raster_t const * rasters, - skc_layer_id const * layer_ids, - skc_float const * txs, - skc_float const * tys, - skc_uint count) -{ - // block and yield if not unsealed - skc_composition_unseal_block(impl,true); - - // - // validate and retain all rasters - // - skc_err err; - - err = skc_runtime_handle_device_validate_retain(impl->runtime, - SKC_TYPED_HANDLE_TYPE_IS_RASTER, - rasters, - count); - if (err) - return err; - - skc_runtime_handle_device_retain(impl->runtime,rasters,count); - - // - // save the stripped handles - // - skc_raster_t * saved = impl->saved.extent.hrw; - - saved += impl->saved.count; - impl->saved.count += count; - - for (skc_uint ii=0; iiruntime->scheduler,(rem = skc_extent_ring_wip_rem(&impl->cmds.ring)) == 0); - - // append commands - skc_uint avail = min(rem,count); - - // decrement count - count -= avail; - - // launch a place kernel after copying commands? - skc_bool const is_wip_full = (avail == rem); - - // if there is no place grid then create one - if (impl->grids.place == NULL) - { - skc_composition_place_create(impl); - } - - // - // FIXME -- OPTIMIZATION? -- the ring_wip_index_inc() test can - // be avoided by splitting into at most two intervals. It should - // be plenty fast as is though so leave for now. 
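  //
  // Editor's sketch of that two-interval split -- hedged, not part of
  // this change.  The idea is to write each chunk of commands as at
  // most two contiguous runs (one up to the wrap point, then one from
  // slot 0) and bump the wip index once, instead of calling
  // skc_extent_ring_wip_index_inc() per element.  The helpers
  // skc_extent_ring_wip_index() and skc_extent_ring_wip_index_add()
  // are assumed here and do not exist in this source; 'local' stands
  // for a temporary array holding the 'avail' commands, and the
  // per-raster happens-after registration below is unaffected:
  //
  //   skc_uint const idx  = skc_extent_ring_wip_index(&impl->cmds.ring);          // assumed accessor
  //   skc_uint const mask = impl->cmds.ring.size.mask;
  //   skc_uint const lo   = min(impl->cmds.ring.size.pow2 - (idx & mask),avail);  // run before the wrap
  //   skc_uint const hi   = avail - lo;                                           // run after the wrap (may be 0)
  //
  //   memcpy(cmds + (idx & mask),local,     lo * sizeof(*cmds));                  // needs <string.h>
  //   memcpy(cmds,               local + lo,hi * sizeof(*cmds));
  //
  //   skc_extent_ring_wip_index_add(&impl->cmds.ring,avail);                      // assumed bulk advance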
- // - union skc_cmd_place * const cmds = impl->cmds.extent.hw1; - - if ((txs == NULL) && (tys == NULL)) - { - while (avail-- > 0) - { - skc_raster_t const raster = *saved++; - - skc_grid_happens_after_handle(impl->grids.place,raster); - - cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = - (union skc_cmd_place){ raster, *layer_ids++, 0, 0 }; - } - } - else if (txs == NULL) - { - while (avail-- > 0) - { - skc_raster_t const raster = *saved++; - - skc_grid_happens_after_handle(impl->grids.place,raster); - - cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = - (union skc_cmd_place){ raster, - *layer_ids++, - 0, - SKC_PLACE_CMD_TY_CONVERT(*tys++) }; - } - } - else if (tys == NULL) - { - while (avail-- > 0) - { - skc_raster_t const raster = *saved++; - - skc_grid_happens_after_handle(impl->grids.place,raster); - - cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = - (union skc_cmd_place){ raster, - *layer_ids++, - SKC_PLACE_CMD_TX_CONVERT(*txs++), - 0 }; - } - } - else - { - while (avail-- > 0) - { - skc_raster_t const raster = *saved++; - - skc_grid_happens_after_handle(impl->grids.place,raster); - - cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = - (union skc_cmd_place){ raster, - *layer_ids++, - SKC_PLACE_CMD_TX_CONVERT(*txs++), - SKC_PLACE_CMD_TY_CONVERT(*tys++) }; - } - } - - // launch place kernel? - if (is_wip_full) { - skc_composition_snap(impl); - } - } while (count > 0); - - return SKC_ERR_SUCCESS; -} - -// -// -// - -static -void -skc_composition_pfn_bounds(struct skc_composition_impl * const impl, skc_int bounds[4]) -{ - // - // FIXME -- not implemented yet - // - // impl bounds will be copied back after sealing - // - bounds[0] = SKC_INT_MIN; - bounds[1] = SKC_INT_MIN; - bounds[2] = SKC_INT_MAX; - bounds[3] = SKC_INT_MAX; -} - -// -// -// - -void -skc_composition_retain_and_lock(struct skc_composition * const composition) -{ - skc_composition_retain(composition); - - composition->impl->lock_count += 1; -} - -void -skc_composition_unlock_and_release(struct skc_composition * const composition) -{ - composition->impl->lock_count -= 1; - - skc_composition_pfn_release(composition->impl); -} - -// -// -// - -skc_err -skc_composition_cl_12_create(struct skc_context * const context, - struct skc_composition * * const composition) -{ - struct skc_runtime * const runtime = context->runtime; - - // retain the context - // skc_context_retain(context); - - // allocate impl - struct skc_composition_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - // allocate composition - (*composition) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**composition)); - - (*composition)->context = context; - (*composition)->impl = impl; - (*composition)->ref_count = 1; - - (*composition)->place = skc_composition_pfn_place; - (*composition)->unseal = skc_composition_pfn_unseal; - (*composition)->seal = skc_composition_pfn_seal; - (*composition)->bounds = skc_composition_pfn_bounds; - (*composition)->release = skc_composition_pfn_release; - - // intialize impl - impl->composition = (*composition); - impl->runtime = runtime; - - SKC_ASSERT_STATE_INIT(impl,SKC_COMPOSITION_STATE_SEALED); - - impl->lock_count = 0; - - impl->grids.sort = NULL; - impl->grids.place = NULL; - - // acquire command queue for sealing/unsealing - impl->cq = skc_runtime_acquire_cq_in_order(runtime); - - // acquire kernels - impl->kernels.place = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_PLACE); - impl->kernels.segment = 
skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK); - - // get config - struct skc_config const * const config = runtime->config; - - // initialize ring size with config values - skc_extent_ring_init(&impl->cmds.ring, - config->composition.cmds.elem_count, - config->composition.cmds.snap_count, - sizeof(union skc_cmd_place)); - - skc_extent_phw1g_tdrNs_alloc(runtime,&impl->cmds.extent ,sizeof(union skc_cmd_place) * config->composition.cmds.elem_count); - skc_extent_phrw_alloc (runtime,&impl->saved.extent,sizeof(skc_raster_t) * config->composition.raster_ids.elem_count); - skc_extent_phr_pdrw_alloc (runtime,&impl->atomics ,sizeof(struct skc_place_atomics)); - - skc_extent_pdrw_alloc (runtime,&impl->keys ,sizeof(skc_ttxk_t) * config->composition.keys.elem_count); - skc_extent_pdrw_alloc (runtime,&impl->offsets ,sizeof(skc_uint) * (1u << SKC_TTCK_HI_BITS_YX)); // 1MB - - // nothing saved - impl->saved.count = 0; - - // unseal the composition, zero the atomics, etc. - skc_composition_unseal_block_reset(impl,false,true); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/composition_cl_12.h b/src/compute/skc/composition_cl_12.h deleted file mode 100644 index 4f52090658..0000000000 --- a/src/compute/skc/composition_cl_12.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include - -#include "composition.h" -#include "assert_state.h" -#include "grid.h" -#include "extent_cl_12.h" -#include "extent_ring.h" - -// -// composition states -// - -typedef enum skc_composition_state_e { - - SKC_COMPOSITION_STATE_UNSEALING, - SKC_COMPOSITION_STATE_UNSEALED, - SKC_COMPOSITION_STATE_SEALING, - SKC_COMPOSITION_STATE_SEALED - -} skc_composition_state_e; - -// -// IMPL -// - -struct skc_composition_impl -{ - struct skc_composition * composition; - struct skc_runtime * runtime; - - SKC_ASSERT_STATE_DECLARE(skc_composition_state_e); - - skc_int lock_count; // wip renders - - struct { - skc_grid_t sort; - skc_grid_t place; - } grids; - - cl_command_queue cq; - - struct { - cl_kernel place; - cl_kernel segment; - } kernels; - - // raster ids must be held until the composition is reset or - // released and then their refcounts can be decremented - struct { - struct skc_extent_phrw extent; - skc_uint count; - } saved; - - struct { - struct skc_extent_ring ring; // how many slots left? - struct skc_extent_phw1g_tdrNs extent; // wip command extent - } cmds; - - // composition extent length - struct skc_extent_phr_pdrw atomics; - - // composition ttck extent - struct skc_extent_pdrw keys; - - // key offsets in sealed and sorted ttck extent - struct skc_extent_pdrw offsets; -}; - -// -// ATOMICS -// - -struct skc_place_atomics -{ - skc_uint keys; - skc_uint offsets; -}; - -// -// ONLY VISIBLE WITHIN THIS RUNTIME -// - -void -skc_composition_retain_and_lock(struct skc_composition * const composition); - -void -skc_composition_unlock_and_release(struct skc_composition * const composition); - -// -// -// diff --git a/src/compute/skc/config_cl.h b/src/compute/skc/config_cl.h deleted file mode 100644 index 0172857b07..0000000000 --- a/src/compute/skc/config_cl.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include "runtime_cl.h" -#include "block_pool_cl.h" - -// -// FIXME -- define individual structs before defining skc_config -// - -struct skc_config -{ - struct { - struct { - skc_uint size; - skc_uint subbufs; - } host; // alignment determined by compiler - struct { - skc_uint size; - skc_uint subbufs; - } device; // alignment determined by device - } suballocator; - - struct { - skc_uint size; - } scheduler; - - struct { - skc_uint bytes; // bytes per subblock -- pow2 - skc_uint words; // words per subblock -- pow2 - // skc_uint words_log2; - } subblock; - - struct { - skc_uint bytes; // bytes per block -- pow2 - skc_uint words; // words per block -- pow2 - skc_uint subblocks; // subblocks per block -- block.bytes >= subblock.bytes - // skc_uint subblocks_log2; - } block; - - union skc_block_pool_size block_pool; - - struct { - skc_cq_type_e type; - skc_uint size; - } cq_pool; - - struct { - skc_uint size; // a large fraction of block pool size - skc_uint width; // determines number of launched reclamation subgroups - skc_uint recs; // how many in-flight width-subgroup reclamation grids - } handle_pool; - - struct { - skc_uint width; // tile width in pixels - skc_uint height; // tile height in pixels - skc_uint ratio; // subblocks per TTPB - } tile; - - struct { - struct { - skc_uint count; // # of subbufs in buffer - } buffer; - - struct { - skc_uint count; // # of blocks/commands in subbuf - } subbuf; - - struct { - size_t buffer; // block.bytes * subbuf.blocks * subbuf.count - size_t subbuf; // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN - } block; - - struct { - size_t buffer; // sizeof(skc_uint) * subbuf.blocks * subbuf.count - size_t subbuf; // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN - } command; - // - // skc_uint paths_lowat; - // - } paths_copy; - - struct { - struct { - skc_uint elem_count; - skc_uint snap_count; - } path_ids; - - struct { - skc_uint elem_count; - skc_uint snap_count; - } transforms; - - struct { - skc_uint elem_count; - skc_uint snap_count; - } clips; - - struct { - skc_uint elem_count; - skc_uint snap_count; - } fill; - - struct { - skc_uint elem_count; - skc_uint snap_count; - } raster_ids; - - struct { - skc_uint cmds; - } expand; - - struct { - skc_uint keys; - } rasterize; - } raster_cohort; - - struct { - struct { - skc_uint elem_count; - skc_uint snap_count; - } cmds; - - struct { - skc_uint elem_count; - } raster_ids; - - struct { - skc_uint elem_count; - } keys; - } composition; -}; - -// -// -// diff --git a/src/compute/skc/cq_pool_cl.c b/src/compute/skc/cq_pool_cl.c deleted file mode 100644 index 80cfe34cf8..0000000000 --- a/src/compute/skc/cq_pool_cl.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#ifndef NDEBUG -#include -#endif - -// -// -// - -#include - -// -// -// - -#include "runtime_cl_12.h" - -// -// This implementation is probably excessive. -// -// The command queue pool could easily be replaced with simply an LRU -// or even round-robin reuse pool. Even a small number of aliased -// command queues can probably enough concurrency. 
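//
// For contrast, the round-robin alternative mentioned above can be
// very small.  The sketch below is an editor's illustration only --
// the skc_cq_rr_* names are hypothetical and not part of SKC -- and
// it assumes a fixed set of queues created once at startup.  Since
// the queues are aliased and handed out in rotation, "release"
// becomes a no-op.
//

#define SKC_CQ_RR_POOL_SIZE 8

struct skc_cq_rr_pool
{
  cl_command_queue cq[SKC_CQ_RR_POOL_SIZE]; // created once at startup
  skc_uint         next;                    // monotonically increasing cursor
};

static
cl_command_queue
skc_cq_rr_pool_acquire(struct skc_cq_rr_pool * const pool)
{
  // hand out queues in rotation -- callers may share a queue
  return pool->cq[pool->next++ % SKC_CQ_RR_POOL_SIZE];
}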
-// - -#define SKC_CQ_POOL_EXPAND 1 - -// -// -// - -void -skc_cq_pool_create(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool, - skc_uint const type, - skc_uint const size) -{ - pool->type = type; - pool->size = size + 1; // an empty spot - pool->reads = 0; - pool->writes = size; - pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq)); - - for (skc_uint ii=0; iicq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type); - } - pool->cq[size] = NULL; -} - -// -// -// - -void -skc_cq_pool_dispose(struct skc_runtime * const runtime, - struct skc_cq_pool * pool) -{ - // - // FIXME -- release the command queues after waiting for the ring to - // be full with pool.size queues? - // - skc_runtime_host_perm_free(runtime,pool->cq); -} - -// -// -// - -static -void -skc_cq_pool_write(struct skc_cq_pool * const pool, - cl_command_queue cq) -{ - pool->cq[pool->writes++ % pool->size] = cq; -} - -// -// only expand when completely empty -// - -static -void -skc_cq_pool_expand(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool, - skc_uint expand) -{ -#ifndef NDEBUG - fprintf(stderr,"Expanding the cq_pool by: %u (%u)\n",expand,pool->size); -#endif - - // free old - skc_runtime_host_perm_free(runtime,pool->cq); - - // the ring is empty - pool->size += expand; - pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq)); - pool->reads = 0; - pool->writes = expand; - - for (skc_uint ii=0; iicq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type); -} - -// -// -// - -static -cl_command_queue -skc_cq_pool_read(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool) -{ - // any command queues left? - if (pool->reads == pool->writes) - skc_cq_pool_expand(runtime,pool,SKC_CQ_POOL_EXPAND); - - cl_command_queue cq = pool->cq[pool->reads++ % pool->size]; - - return cq; -} - -// -// -// - -cl_command_queue -skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime) -{ - return skc_cq_pool_read(runtime,&runtime->cq_pool); -} - -void -skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, - cl_command_queue cq) -{ - skc_cq_pool_write(&runtime->cq_pool,cq); -} - -// -// -// diff --git a/src/compute/skc/cq_pool_cl.h b/src/compute/skc/cq_pool_cl.h deleted file mode 100644 index 0cc73a2f82..0000000000 --- a/src/compute/skc/cq_pool_cl.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -#include "types.h" - -// -// Why we need to wrap command queue creation: -// -// - command queue creation is expensive -// -// - the CL 1.2 function is deprecated in 2.0 -// - -struct skc_cq_pool -{ - skc_cq_type_e type; - skc_uint size; - skc_uint reads; - skc_uint writes; - cl_command_queue * cq; -}; - -//l -// -// - -void -skc_cq_pool_create(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool, - skc_uint const type, - skc_uint const size); - -void -skc_cq_pool_dispose(struct skc_runtime * const runtime, - struct skc_cq_pool * pool); - -// -// -// diff --git a/src/compute/skc/device_cl_12.h b/src/compute/skc/device_cl_12.h deleted file mode 100644 index 637b61ae10..0000000000 --- a/src/compute/skc/device_cl_12.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include - -// -// -// - -#define SKC_CL_ARG(arg) sizeof(arg),&arg - -// -// -// - -typedef enum skc_device_kernel_id { - SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS, - SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS, - - SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, - SKC_DEVICE_KERNEL_ID_PATHS_COPY, - - SKC_DEVICE_KERNEL_ID_FILLS_EXPAND, - - SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL, - SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES, - SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS, - SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS, - SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_QUADS, - SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_CUBICS, - - SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK, - SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC, - - SKC_DEVICE_KERNEL_ID_PREFIX, - SKC_DEVICE_KERNEL_ID_PLACE, - SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, - - SKC_DEVICE_KERNEL_ID_RENDER, - - SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM, - SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM, - - // - SKC_DEVICE_KERNEL_ID_COUNT - -} skc_device_kernel_id; - -// -// -// - -void -skc_device_create(struct skc_runtime * const runtime); - - -void -skc_device_dispose(struct skc_runtime * const runtime); - - -// -// multi-threading/context/device requires multiple kernel instances -// - -cl_kernel -skc_device_acquire_kernel(struct skc_device * const device, - skc_device_kernel_id const type); - -// -// grid shape can vary greatly by target platform -// -void -skc_device_enqueue_kernel(struct skc_device * const device, - skc_device_kernel_id const type, - cl_command_queue cq, - cl_kernel kernel, - size_t const work_size, - cl_uint num_events_in_wait_list, - cl_event const * const event_wait_list, - cl_event * const event); - -// -// -// diff --git a/src/compute/skc/device_cl_12_avx2.h b/src/compute/skc/device_cl_12_avx2.h deleted file mode 100644 index e68579c0f7..0000000000 --- a/src/compute/skc/device_cl_12_avx2.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_ONCE_DEVICE_CL_12_AVX2_H -#define SKC_ONCE_DEVICE_CL_12_AVX2_H - -// -// -// - -#define SKC_DEVICE_BLOCK_WORDS_LOG2 6 -#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 4 - -// -// -// - -#define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) -#define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -// -// -// - -#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) - -// -// -// - -#define SKC_COPY_PATHS_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS -#define SKC_COPY_PATHS_ELEM_WORDS 1 - -// -// -// - -#define SKC_EXPAND_FILLS_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS -#define SKC_EXPAND_FILLS_ELEM_WORDS 1 - -// -// -// - -#define SKC_RASTERIZE_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/device_cl_12_gen9.c b/src/compute/skc/device_cl_12_gen9.c deleted file mode 100644 index 5b4d9d2dd2..0000000000 --- a/src/compute/skc/device_cl_12_gen9.c +++ /dev/null @@ -1,942 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#include -#include -#include - -#include "common/cl/assert_cl.h" -#include "macros.h" - -#include "config_cl.h" -#include "runtime_cl_12.h" - -#include "raster.h" -#include "tile.h" - -#include "hs/cl/hs_cl_launcher.h" -#include "hs/cl/gen9/hs_cl.h" - -// -// -// - -#define SKC_KERNEL_SPIRV 0 -#define SKC_KERNEL_BINARY 1 -#define SKC_KERNEL_SRC 0 - -// -// -// - -#if SKC_KERNEL_SPIRV - -#include "block_pool_init.pre.spv.inl" -#include "paths_copy.pre.spv.inl" -#include "fills_expand.pre.spv.inl" -#include "rasterize.pre.spv.inl" -#include "segment_ttrk.pre.spv.inl" -#include "rasters_alloc.pre.spv.inl" -#include "prefix.pre.spv.inl" -#include "place.pre.spv.inl" -#include "segment_ttck.pre.spv.inl" -#include "render.pre.spv.inl" -#include "paths_reclaim.pre.spv.inl" -#include "rasters_reclaim.pre.spv.inl" - -#elif SKC_KERNEL_BINARY - -#include "block_pool_init.pre.bin.inl" -#include "paths_copy.pre.bin.inl" -#include "fills_expand.pre.bin.inl" -#include "rasterize.pre.bin.inl" -#include "segment_ttrk.pre.bin.inl" -#include "rasters_alloc.pre.bin.inl" -#include "prefix.pre.bin.inl" -#include "place.pre.bin.inl" -#include "segment_ttck.pre.bin.inl" -#include "render.pre.bin.inl" -#include "paths_reclaim.pre.bin.inl" -#include "rasters_reclaim.pre.bin.inl" - -#elif SKC_KERNEL_SRC - -#include "block_pool_init.pre.src.inl" -#include "paths_copy.pre.src.inl" -#include "fills_expand.pre.src.inl" -#include "rasterize.pre.src.inl" -#include "segment_ttrk.pre.src.inl" -#include "rasters_alloc.pre.src.inl" -#include "prefix.pre.src.inl" -#include "place.pre.src.inl" -#include "segment_ttck.pre.src.inl" -#include "render.pre.src.inl" -#include "paths_reclaim.pre.src.inl" -#include "rasters_reclaim.pre.src.inl" - -#endif - -// -// -// - -#include "device_cl_12_gen9.h" - -// -// FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY -// - -static -struct skc_config const config = - { - .suballocator = { - .host = { - .size = 1024 * 1024, // words - .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) - }, - .device = { - .size = 128 * 1024 * 1024, - .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) - } - }, - - .scheduler = { - .size = 4096 // 128 // fixme -- this is just for testing -- too big - }, - - .subblock = { - .words = SKC_DEVICE_SUBBLOCK_WORDS, // words per subblock -- pow2 - .bytes = SKC_DEVICE_SUBBLOCK_WORDS * sizeof(skc_uint) // bytes per subblock -- pow2 - }, - - .block = { - .words = SKC_DEVICE_BLOCK_WORDS, // words per block -- pow2 - .bytes = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint), // bytes per block -- pow2 - .subblocks = SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS // subblocks per block -- block.bytes >= subblock.bytes - }, - - .block_pool = { - .pool_size = 524288, // blocks in pool -- 128 MB - .ring_pow2 = 524288, // blocks in pool rounded up pow2 - .ring_mask = 524288 - 1 - }, - - .cq_pool = { -#ifndef NDEBUG - .type = SKC_CQ_TYPE_IN_ORDER_PROFILING, -#else - .type = 0, -#endif - .size = 8 - }, - - .handle_pool = { - .size = 262144, // large fraction of block pool size (for now, 1:2) - .width = SKC_RECLAIM_ARRAY_SIZE, - .recs = 256 // too many? too few? 
- }, - - .tile = { - .width = SKC_TILE_WIDTH, // tile width in pixels - .height = SKC_TILE_HEIGHT, // tile height in pixels - .ratio = SKC_TILE_HEIGHT / SKC_TILE_WIDTH // subblocks per TTPB - }, - - .paths_copy = { - - .buffer = { - .count = 16 // # of subbufs in buffer - }, - - .subbuf = { - .count = 1024 // # of blocks/commands in subbuf - }, - - .block = { - .subbuf = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024, // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN - .buffer = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024 * 16 // block.bytes * subbuf.blocks * subbuf.count - }, - - .command = { - .subbuf = sizeof(skc_uint) * 1024, // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN - .buffer = sizeof(skc_uint) * 1024 * 16 // sizeof(skc_uint) * subbuf.blocks * subbuf.count - }, - - // skc_uint paths_lowat; - }, - - .raster_cohort = { - .path_ids = { - .elem_count = 8192, - .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER - }, - - .transforms = { - .elem_count = 8192, - .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER - }, - - .clips = { - .elem_count = 8192, - .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER - }, - - .fill = { - .elem_count = 8192, - .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER - }, - - .raster_ids = { - .elem_count = 8192, - .snap_count = (1<device->kernels[id] = clCreateKernel(program,name,&cl_err); cl_ok(cl_err); - - // - // release program now - // - // FIXME -- if/when we multithread then we need to clone kernels - // (>=2.1) or keep programs around (<=2.0) - // - - // get workgroup size - cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], - runtime->cl.device_id, - CL_KERNEL_COMPILE_WORK_GROUP_SIZE, - sizeof(runtime->device->reqd_szs[0]), - runtime->device->reqd_szs[id], - NULL)); - - // - // GEN9+ PROBING - // -#define SKC_TARGET_GEN9 -#ifdef SKC_TARGET_GEN9 - -#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 -#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 -#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A - - cl_ulong spill_mem_size; - - cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], - runtime->cl.device_id, - CL_KERNEL_SPILL_MEM_SIZE_INTEL, - sizeof(spill_mem_size), - &spill_mem_size, - NULL)); - - fprintf(stderr,"\t\tspill mem size: %lu bytes\n", - (unsigned long)spill_mem_size); - - cl_ulong local_mem_size; - - cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], - runtime->cl.device_id, - CL_KERNEL_LOCAL_MEM_SIZE, - sizeof(local_mem_size), - &local_mem_size, - NULL)); - - fprintf(stderr,"\t\tlocal mem size: %lu bytes\n", - (unsigned long)local_mem_size); -#endif - } -} - -static -void -skc_device_build_program(struct skc_runtime * const runtime, - struct skc_program_source const * const source, - struct skc_program_kernel const * const kernels, - skc_uint const kernel_count) -{ - cl_program program; - - fprintf(stderr,"%-20s: ",source->name); - - cl_int cl_err; - -#if SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V - - fprintf(stderr,"Creating (SPIR-V) ... "); - - program = clCreateProgramWithIL(runtime->cl.context, - source->src, - source->srclen, - &cl_err); - -#elif SKC_KERNEL_BINARY // PROGRAM IS BINARY - - fprintf(stderr,"Creating (Binary) ... "); - - cl_int status; - program = clCreateProgramWithBinary(runtime->cl.context, - 1, - &runtime->cl.device_id, - &source->srclen, - (unsigned char const *[]){ source->src }, - &status, - &cl_err); - -#elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE - - fprintf(stderr,"Creating (Source) ... 
"); - - program = clCreateProgramWithSource(runtime->cl.context, - 1, - (char const *[]){ source->src }, - &source->srclen, - &cl_err); -#else - -#error "SKC_KERNEL_???" - -#endif - - cl_ok(cl_err); - - fprintf(stderr,"Building ... "); - - // build the program - cl(BuildProgram(program, - 1, - &runtime->cl.device_id, - source->options, // build options are ignored by binary - NULL, - NULL)); - - fprintf(stderr,"Done\n"); - - // build the kernels - skc_device_create_kernels(runtime,kernels,kernel_count,program); - - // we're done with program for now - // can always recover it from a kernel instance - cl(ReleaseProgram(program)); -} - -// -// RELEASE KERNELS -// - -static -void -skc_device_release_kernels(struct skc_device * const device) -{ - for (skc_int ii=0; iikernels); ii++) - cl(ReleaseKernel(device->kernels[ii])); -} - - - -cl_kernel -skc_device_acquire_kernel(struct skc_device * const device, - skc_device_kernel_id const type) -{ - cl_kernel kernel = device->kernels[type]; - - cl(RetainKernel(kernel)); - - return kernel; -} - -// -// INITIALIZE KERNEL ARGS -// -// FIXME -// -// pre-assign any kernel arguments that are never going to change -- -// for example, the block pool -// - -// -// -// - -#define SKC_DEVICE_BUILD_PROGRAM(p) \ - skc_device_build_program(runtime,&program_sources.p,program_kernels.p,SKC_COUNT_OF(program_kernels.p)) - - -void -skc_device_create(struct skc_runtime * const runtime) -{ - struct skc_device * const device = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*device)); - - // hang device off of runtime - runtime->device = device; - - // hang config off of runtime - runtime->config = &config; - - // create kernels - SKC_DEVICE_BUILD_PROGRAM(block_pool_init); - SKC_DEVICE_BUILD_PROGRAM(paths_copy); - SKC_DEVICE_BUILD_PROGRAM(fills_expand); - SKC_DEVICE_BUILD_PROGRAM(rasterize); - SKC_DEVICE_BUILD_PROGRAM(segment_ttrk); - SKC_DEVICE_BUILD_PROGRAM(rasters_alloc); - SKC_DEVICE_BUILD_PROGRAM(prefix); - SKC_DEVICE_BUILD_PROGRAM(place); - SKC_DEVICE_BUILD_PROGRAM(segment_ttck); - SKC_DEVICE_BUILD_PROGRAM(render); - SKC_DEVICE_BUILD_PROGRAM(paths_reclaim); - SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim); - - // create HotSort instance -- FIXME -- how this occurs needs to be cleaned up - hs_create(runtime->cl.context,runtime->cl.device_id,NULL); -} - -void -skc_device_dispose(struct skc_runtime * const runtime) -{ - // - // FIXME -- dispose of programs, kernels, etc. - // - - skc_runtime_host_perm_free(runtime,runtime->device); -} - -// -// FIXME -- just pass the device type -// - -void -skc_device_enqueue_kernel(struct skc_device * const device, - skc_device_kernel_id const type, - cl_command_queue cq, - cl_kernel kernel, - size_t const work_size, - cl_uint num_events_in_wait_list, - cl_event const * const event_wait_list, - cl_event * const event) -{ - if (work_size == 0) - return; - - cl_uint work_dim [1]; - size_t work_global[3]; - size_t work_local [3]; - - size_t * work_local_ptr = program_kernels.kernels[type].shaper(work_size, - work_dim, - work_global, - work_local); - cl(EnqueueNDRangeKernel(cq, - kernel,// device->kernels[type], - work_dim[0], - NULL, - work_global, - work_local_ptr, - num_events_in_wait_list, - event_wait_list, - event)); -} - -// -// -// diff --git a/src/compute/skc/device_cl_12_gen9.h b/src/compute/skc/device_cl_12_gen9.h deleted file mode 100644 index dd69a845c2..0000000000 --- a/src/compute/skc/device_cl_12_gen9.h +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright 2017 Google Inc. 
- * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_ONCE_DEVICE_CL_12_GEN9_H -#define SKC_ONCE_DEVICE_CL_12_GEN9_H - -// -// FIXME -- THERE ARE SOME DUPLICATED TYPEDEFS IN THIS FILE -// -// THESE WILL GO AWAY AS THE TYPING GET POLISHED AND SIMPLIFIED -// - -#include "block.h" - -// -// HOW TO SELECT A SUBBLOCK AND BLOCK SIZES: -// -// 1) The subblock size should match the natural SIMT/SIMD width of -// the target device. -// -// 2) Either a square or rectangular (1:2) tile size is chosen. The -// tile size is usually determined by the amount of SMEM available -// to a render kernel subgroup and desired multiprocessor -// occupancy. -// -// 3) If the tile is rectangular then the block size must be at least -// twice the size of the subblock size. -// -// 4) A large block size can decrease allocation overhead but there -// will be diminishing returns as the block size increases. -// - -#define SKC_DEVICE_BLOCK_WORDS_LOG2 6 // CHANGE "WORDS" TO "SIZE" ? -#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 3 - -#define SKC_TILE_WIDTH_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 -#define SKC_TILE_HEIGHT_LOG2 (SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + 1) - -///////////////////////////////////////////////////////////////// -// -// BLOCK POOL INIT -// - -#define SKC_BP_INIT_IDS_KERNEL_ATTRIBS -#define SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(2,1,1))) - -///////////////////////////////////////////////////////////////// -// -// PATHS ALLOC -// - -#define SKC_PATHS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(1,1,1))) - -///////////////////////////////////////////////////////////////// -// -// PATHS COPY -// - -#define SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? -#define SKC_PATHS_COPY_ELEM_WORDS 1 -#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_1() - -#define SKC_PATHS_COPY_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_COPY_SUBGROUP_SIZE))) - -#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + get_sub_group_local_id() >= SKC_PATH_HEAD_WORDS) - -typedef skc_uint skc_paths_copy_elem; -typedef skc_uint skc_pb_idx_v; - -///////////////////////////////////////////////////////////////// -// -// FILLS EXPAND -// - -#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 -#define SKC_FILLS_EXPAND_ELEM_WORDS 1 - -#define SKC_FILLS_EXPAND_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_FILLS_EXPAND_SUBGROUP_SIZE))) - -///////////////////////////////////////////////////////////////// -// -// RASTER ALLOC -// -// NOTE -- Intel subgroup shuffles aren't supported in SIMD32 which is -// why use of the subgroup broadcast produces a compiler error. So a -// subgroup of size 16 is this widest we can require. 
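//
// Editor's aside: plugging the constants at the top of this header
// into the "HOW TO SELECT A SUBBLOCK AND BLOCK SIZES" guidance gives
// the following GEN9 numbers (all derivable from the macros, shown
// here only as a sanity check):
//
//   subblock = 1 << 3 =  8 words   -- matches the SIMD8 subgroup width used by several kernels here
//   block    = 1 << 6 = 64 words   -- 8 subblocks per block
//   tile     = 8 x 16 pixels       -- rectangular 1:2, so rule (3) needs block >= 2 x subblock (64 >= 16, ok)
//   ratio    = 16 / 8 = 2          -- subblocks per TTPB
//
#if 0 // restated as compile-time checks, assuming SKC_STATIC_ASSERT from "macros.h"
SKC_STATIC_ASSERT((1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) ==  8);
SKC_STATIC_ASSERT((1u << SKC_DEVICE_BLOCK_WORDS_LOG2)    == 64);
SKC_STATIC_ASSERT((1u << SKC_TILE_WIDTH_LOG2)            ==  8);
SKC_STATIC_ASSERT((1u << SKC_TILE_HEIGHT_LOG2)           == 16);
#endif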
-// - -#define SKC_RASTERS_ALLOC_GROUP_SIZE 16 - -#if (SKC_RASTERS_ALLOC_GROUP_SIZE <= 16) - -#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE))) -#define SKC_RASTERS_ALLOC_LOCAL_ID() get_sub_group_local_id() -#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) sub_group_scan_inclusive_add(v) -#define SKC_RASTERS_ALLOC_BROADCAST(v,i) sub_group_broadcast(v,i) - -#else - -#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE,1,1))) -#define SKC_RASTERS_ALLOC_LOCAL_ID() get_local_id(0) -#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) work_group_scan_inclusive_add(v) -#define SKC_RASTERS_ALLOC_BROADCAST(v,i) work_group_broadcast(v,i) - -#endif - -///////////////////////////////////////////////////////////////// -// -// RASTERIZE -// - -#define SKC_RASTERIZE_SUBGROUP_SIZE SKC_DEVICE_SUBBLOCK_WORDS -#define SKC_RASTERIZE_VECTOR_SIZE_LOG2 0 -#define SKC_RASTERIZE_WORKGROUP_SUBGROUPS 1 - -#define SKC_RASTERIZE_KERNEL_ATTRIBS \ - __attribute__((intel_reqd_sub_group_size(SKC_RASTERIZE_SUBGROUP_SIZE))) \ - __attribute__((reqd_work_group_size(SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_WORKGROUP_SUBGROUPS, 1, 1))) - -#define SKC_RASTERIZE_FLOAT float -#define SKC_RASTERIZE_UINT uint -#define SKC_RASTERIZE_INT int -#define SKC_RASTERIZE_PREDICATE bool -#define SKC_RASTERIZE_POOL uint - -#define SKC_RASTERIZE_TILE_HASH_X_BITS 1 -#define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 - -typedef skc_block_id_t skc_block_id_v_t; -typedef skc_uint2 skc_ttsk_v_t; -typedef skc_uint2 skc_ttsk_s_t; - -// SKC_STATIC_ASSERT(SKC_RASTERIZE_POOL_SIZE > SKC_RASTERIZE_SUBGROUP_SIZE); - -///////////////////////////////////////////////////////////////// -// -// PREFIX -// - -#define SKC_PREFIX_SUBGROUP_SIZE 8 // for now this had better be SKC_DEVICE_SUBBLOCK_WORDS -#define SKC_PREFIX_WORKGROUP_SUBGROUPS 1 - -#define SKC_PREFIX_KERNEL_ATTRIBS \ - __attribute__((intel_reqd_sub_group_size(SKC_PREFIX_SUBGROUP_SIZE))) \ - __attribute__((reqd_work_group_size(SKC_PREFIX_SUBGROUP_SIZE * SKC_PREFIX_WORKGROUP_SUBGROUPS, 1, 1))) - -#define SKC_PREFIX_TTP_V skc_uint2 -#define SKC_PREFIX_TTS_V_BITFIELD skc_int - -#define SKC_PREFIX_TTS_VECTOR_INT_EXPAND SKC_EXPAND_1 - -#define SKC_PREFIX_SMEM_ZERO ulong -#define SKC_PREFIX_SMEM_ZERO_WIDTH (sizeof(SKC_PREFIX_SMEM_ZERO) / sizeof(skc_ttp_t)) -#define SKC_PREFIX_SMEM_COUNT_BLOCK_ID 8 - -#define SKC_PREFIX_BLOCK_ID_V_SIZE SKC_PREFIX_SUBGROUP_SIZE - -#define SKC_PREFIX_TTXK_V_SIZE SKC_PREFIX_SUBGROUP_SIZE -#define SKC_PREFIX_TTXK_V_MASK (SKC_PREFIX_TTXK_V_SIZE - 1) - -typedef skc_uint skc_bp_elem_t; - -typedef skc_uint2 skc_ttrk_e_t; -typedef skc_uint2 skc_ttsk_v_t; -typedef skc_uint2 skc_ttsk_s_t; -typedef skc_uint2 skc_ttpk_s_t; -typedef skc_uint2 skc_ttxk_v_t; - -typedef skc_int skc_tts_v_t; - -typedef skc_int skc_ttp_t; - -typedef skc_uint skc_raster_yx_s; - -typedef skc_block_id_t skc_block_id_v_t; -typedef skc_block_id_t skc_block_id_s_t; - -///////////////////////////////////////////////////////////////// -// -// PLACE -// - -#define SKC_PLACE_SUBGROUP_SIZE 16 -#define SKC_PLACE_WORKGROUP_SUBGROUPS 1 - -#define SKC_PLACE_KERNEL_ATTRIBS \ - __attribute__((intel_reqd_sub_group_size(SKC_PLACE_SUBGROUP_SIZE))) \ - __attribute__((reqd_work_group_size(SKC_PLACE_SUBGROUP_SIZE * SKC_PLACE_WORKGROUP_SUBGROUPS, 1, 1))) - -typedef skc_uint skc_bp_elem_t; - -typedef skc_uint skc_ttsk_lo_t; -typedef skc_uint skc_ttsk_hi_t; - -typedef skc_uint skc_ttpk_lo_t; -typedef skc_uint skc_ttpk_hi_t; - -typedef skc_uint 
skc_ttxk_lo_t; -typedef skc_uint skc_ttxk_hi_t; - -typedef skc_uint2 skc_ttck_t; - -typedef skc_bool skc_pred_v_t; -typedef skc_int skc_int_v_t; - -///////////////////////////////////////////////////////////////// -// -// RENDER -// - -#define SKC_ARCH_GEN9 - -#if defined(__OPENCL_C_VERSION__) -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif - -#define SKC_RENDER_SUBGROUP_SIZE 8 -#define SKC_RENDER_WORKGROUP_SUBGROUPS 1 - -#define SKC_RENDER_KERNEL_ATTRIBS \ - __attribute__((intel_reqd_sub_group_size(SKC_RENDER_SUBGROUP_SIZE))) \ - __attribute__((reqd_work_group_size(SKC_RENDER_SUBGROUP_SIZE * SKC_RENDER_WORKGROUP_SUBGROUPS, 1, 1))) - -#define SKC_RENDER_SCANLINE_VECTOR_SIZE 2 - -#define SKC_RENDER_REGS_COLOR_R 2 -#define SKC_RENDER_REGS_COVER_R 3 - -#define SKC_RENDER_TTSB_EXPAND() SKC_EXPAND_1() - -#define SKC_RENDER_TTS_V skc_int -#define SKC_RENDER_TTS_V_BITFIELD skc_int - -#define SKC_RENDER_TTP_V skc_int2 -#define SKC_RENDER_AREA_V skc_int2 - -#define SKC_RENDER_TILE_COLOR_PAIR half2 -#define SKC_RENDER_TILE_COLOR_PAIR_LOAD(x,v) vload2(x,v) - -#define SKC_RENDER_SURFACE_COLOR half4 -#define SKC_RENDER_SURFACE_WRITE write_imageh - -// #define SKC_RENDER_TTXB_VECTOR_INT int2 -// #define SKC_RENDER_TTXB_VECTOR_UINT uint2 - -#define SKC_RENDER_WIDE_AA ulong // SLM = 64 bytes/clock - -#define SKC_RENDER_TILE_COLOR half2 -#define SKC_RENDER_TILE_COVER half2 - -#define SKC_RENDER_ACC_COVER_INT int2 -#define SKC_RENDER_ACC_COVER_UINT uint2 - -#define SKC_RENDER_GRADIENT_FLOAT float2 -#define SKC_RENDER_GRADIENT_INT int2 -#define SKC_RENDER_GRADIENT_STOP int2 -#define SKC_RENDER_GRADIENT_FRAC half2 -#define SKC_RENDER_GRADIENT_COLOR_STOP half - -#define SKC_RENDER_SURFACE_U8_RGBA uint2 - -#define SKC_RENDER_TILE_COLOR_VECTOR uint16 -#define SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT uint -#define SKC_RENDER_TILE_COLOR_VECTOR_COUNT ((sizeof(SKC_RENDER_TILE_COLOR) * 4 * SKC_TILE_WIDTH) / sizeof(SKC_RENDER_TILE_COLOR_VECTOR)) - -///////////////////////////////////////////////////////////////// -// -// PATHS & RASTERS RECLAIM -// -// FIXME -- investigate enabling the stride option for a smaller grid -// that iterates over a fixed number of threads. Since reclamation is -// a low-priority task, it's probably reasonable to trade longer -// reclamation times for lower occupancy of the device because it -// might delay the fastpath of the pipeline. -// - -#define SKC_RECLAIM_ARRAY_SIZE (7 * 8 / 2) // 8 EUs with 7 hardware threads divided by 2 is half a sub-slice - -///////////////////////////////////////////////////////////////// -// -// PATHS RECLAIM -// - -#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? -#define SKC_PATHS_RECLAIM_LOCAL_ELEMS 1 -#define SKC_PATHS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_RECLAIM_SUBGROUP_SIZE))) - -///////////////////////////////////////////////////////////////// -// -// RASTERS RECLAIM -// - -#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? 
-#define SKC_RASTERS_RECLAIM_LOCAL_ELEMS 1 -#define SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_RECLAIM_SUBGROUP_SIZE))) - -// -// COMMON -- FIXME -- HOIST THESE ELSEWHERE -// - -#define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) -#define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -#define SKC_DEVICE_BLOCK_DWORDS (SKC_DEVICE_BLOCK_WORDS / 2) - -#define SKC_DEVICE_BLOCK_WORDS_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2) -#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2 - SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) - -#define SKC_TILE_RATIO (SKC_TILE_HEIGHT / SKC_TILE_WIDTH) - -// -// -// - -#define SKC_PATHS_COPY_SUBGROUP_SIZE (1 << SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2) -#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE (1 << SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2) -#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE (1 << SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2) -#define SKC_FILLS_EXPAND_SUBGROUP_SIZE (1 << SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2) - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/export_cl_12.h b/src/compute/skc/export_cl_12.h deleted file mode 100644 index e577282791..0000000000 --- a/src/compute/skc/export_cl_12.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include "skc.h" - -// -// -// - -skc_err -skc_path_builder_cl_12_create(struct skc_context * const context, - struct skc_path_builder * * const path_builder); - -// -// -// - -skc_err -skc_raster_builder_cl_12_create(struct skc_context * const context, - struct skc_raster_builder * * const raster_builder); - -// -// -// - -skc_err -skc_composition_cl_12_create(struct skc_context * const context, - struct skc_composition * * const composition); - -// -// -// - -skc_err -skc_styling_cl_12_create(struct skc_context * const context, - struct skc_styling * * const styling, - uint32_t const layers_count, - uint32_t const groups_count, - uint32_t const extras_count); - -// -// -// - -skc_err -skc_surface_cl_12_create(struct skc_context * const context, - struct skc_surface * * const surface); - -// -// -// - diff --git a/src/compute/skc/extent_cl_12.c b/src/compute/skc/extent_cl_12.c deleted file mode 100644 index 73676d8063..0000000000 --- a/src/compute/skc/extent_cl_12.c +++ /dev/null @@ -1,459 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include - -#include "common/cl/assert_cl.h" -#include "extent_cl_12.h" -#include "runtime_cl_12.h" - -// -// DURABLE R/W HOST EXTENT -- STANDARD CACHED MEMORY -// - -void -skc_extent_phrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrw * const extent, - size_t const size) -{ - extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); -} - -void -skc_extent_phrw_free(struct skc_runtime * const runtime, - struct skc_extent_phrw * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hrw); -} - -// -// DURABLE R/W DEVICE EXTENT -- ALLOCATED FROM DEVICE HEAP -// - -void -skc_extent_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_pdrw * const extent, - size_t const size) -{ - extent->drw = skc_runtime_device_perm_alloc(runtime, - CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, - size); -} - -void -skc_extent_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_pdrw * const extent) -{ - skc_runtime_device_perm_free(runtime,extent->drw); -} - -// -// EPHEMERAL DEVICE R/W EXTENT -- ALLOCATED QUICKLY FROM A MANAGED RING -// - -void -skc_extent_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_tdrw * const extent, - size_t const size) -{ - extent->size = size; - extent->drw = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, - size,&extent->id,NULL); -} - -void -skc_extent_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_tdrw * const extent) -{ - skc_runtime_device_temp_free(runtime,extent->drw,extent->id); -} - -void -skc_extent_tdrw_zero(struct skc_extent_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - skc_uint const zero = 0; - - cl(EnqueueFillBuffer(cq, - extent->drw, - &zero, - sizeof(zero), - 0, - extent->size, - 0,NULL,event)); -} - -// -// DURABLE SMALL EXTENTS BACKING ATOMICS -// - -void -skc_extent_phr_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phr_pdrw * const extent, - size_t const size) -{ - extent->size = size; - extent->hr = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_ONLY,size); - extent->drw = skc_runtime_device_perm_alloc(runtime,CL_MEM_READ_WRITE,size); -} - -void -skc_extent_phr_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_phr_pdrw * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hr); - skc_runtime_device_perm_free(runtime,extent->drw); -} - -void -skc_extent_phr_pdrw_read(struct skc_extent_phr_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - cl(EnqueueReadBuffer(cq, - extent->drw, - CL_FALSE, - 0, - extent->size, - extent->hr, - 0,NULL,event)); -} - -void -skc_extent_phr_pdrw_zero(struct skc_extent_phr_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - skc_uint const zero = 0; - - cl(EnqueueFillBuffer(cq, - extent->drw, - &zero, - sizeof(zero), - 0, - extent->size, - 0,NULL,event)); -} - -// -// EPHEMERAL SMALL EXTENTS BACKING ATOMICS -// - -void -skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_thr_tdrw * const extent, - size_t const size) -{ - extent->size = size; - extent->hr = skc_runtime_host_temp_alloc(runtime, - SKC_MEM_FLAGS_READ_WRITE, - size,&extent->id.hr,NULL); - extent->drw = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_WRITE, - size, - &extent->id.drw, - NULL); -} - -void 
-skc_extent_thr_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_thr_tdrw * const extent) -{ - skc_runtime_host_temp_free(runtime,extent->hr,extent->id.hr); - skc_runtime_device_temp_free(runtime,extent->drw,extent->id.drw); -} - -void -skc_extent_thr_tdrw_read(struct skc_extent_thr_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - cl(EnqueueReadBuffer(cq, - extent->drw, - CL_FALSE, - 0, - extent->size, - extent->hr, - 0,NULL,event)); -} - -void -skc_extent_thr_tdrw_zero(struct skc_extent_thr_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - skc_uint const zero = 0; - - cl(EnqueueFillBuffer(cq, - extent->drw, - &zero, - sizeof(zero), - 0, - extent->size, - 0,NULL,event)); -} - -// -// DURABLE W/1 HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT -// - -void -skc_extent_phw1g_tdrNs_alloc(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent, - size_t const size) -{ - extent->hw1 = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_WRITE_ONLY,size); -} - -void -skc_extent_phw1g_tdrNs_free(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hw1); -} - -void -skc_extent_phw1g_tdrNs_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phw1g_tdrNs_snap * const snap) -{ - snap->snap = skc_extent_ring_snap_alloc(runtime,ring); -} - -void -skc_extent_phw1g_tdrNs_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent, - struct skc_extent_phw1g_tdrNs_snap * const snap, - cl_command_queue const cq, - cl_event * const event) -{ - struct skc_extent_ring const * const ring = snap->snap->ring; - - skc_uint const count = skc_extent_ring_snap_count(snap->snap); - size_t const size = count * ring->size.elem; - - snap->drN = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, - size,&snap->id,NULL); - - if (count == 0) - return; - - // possibly two copies - skc_uint const index_lo = snap->snap->reads & ring->size.mask; - skc_uint const count_max = ring->size.pow2 - index_lo; - skc_uint const count_lo = min(count_max,count); - size_t const bytes_lo = count_lo * ring->size.elem; - - if (count > count_max) - { - skc_uint const bytes_hi = (count - count_max) * ring->size.elem; - - cl(EnqueueWriteBuffer(cq, - snap->drN, - CL_FALSE, - bytes_lo, - bytes_hi, - extent->hw1, // offset_hi = 0 - 0,NULL,NULL)); - } - - size_t const offset_lo = index_lo * ring->size.elem; - - cl(EnqueueWriteBuffer(cq, - snap->drN, - CL_FALSE, - 0, - bytes_lo, - (skc_uchar*)extent->hw1 + offset_lo, - 0,NULL,event)); - -} - -void -skc_extent_phw1g_tdrNs_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs_snap * const snap) -{ - skc_runtime_device_temp_free(runtime,snap->drN,snap->id); - skc_extent_ring_snap_free(runtime,snap->snap); -} - -// -// DURABLE R/W HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT -// - -void -skc_extent_phrwg_tdrNs_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent, - size_t const size) -{ - extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); // WRITE-ONCE -} - -void -skc_extent_phrwg_tdrNs_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hrw); -} - -void 
-skc_extent_phrwg_tdrNs_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phrwg_tdrNs_snap * const snap) -{ - snap->snap = skc_extent_ring_snap_alloc(runtime,ring); -} - -void -skc_extent_phrwg_tdrNs_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent, - struct skc_extent_phrwg_tdrNs_snap * const snap, - cl_command_queue const cq, - cl_event * const event) -{ - struct skc_extent_ring const * const ring = snap->snap->ring; - - skc_uint const count = skc_extent_ring_snap_count(snap->snap); - size_t const size = count * ring->size.elem; - - snap->drN = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, - size,&snap->id,NULL); - - if (count == 0) - return; - - // possibly two copies - skc_uint const index_lo = snap->snap->reads & ring->size.mask; - skc_uint const count_max = ring->size.pow2 - index_lo; - skc_uint const count_lo = min(count_max,count); - size_t const bytes_lo = count_lo * ring->size.elem; - - if (count > count_max) - { - skc_uint const count_hi = count - count_max; - skc_uint const bytes_hi = count_hi * ring->size.elem; - - cl(EnqueueWriteBuffer(cq, - snap->drN, - CL_FALSE, - bytes_lo, - bytes_hi, - extent->hrw, // offset_hi = 0 - 0,NULL,NULL)); - } - - size_t offset_lo = index_lo * ring->size.elem; - - cl(EnqueueWriteBuffer(cq, - snap->drN, - CL_FALSE, - 0, - bytes_lo, - (skc_uchar*)extent->hrw + offset_lo, - 0,NULL,event)); - -} - -void -skc_extent_phrwg_tdrNs_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs_snap * const snap) -{ - skc_runtime_device_temp_free(runtime,snap->drN,snap->id); - skc_extent_ring_snap_free(runtime,snap->snap); -} - -// -// DURABLE HOST R/W RING WITH AN EPHEMERAL HOST R/1 SNAPSHOT -// -// Note that because the ring and snapshot are both in host memory and -// the snapshot blocks progress until freed we can simply point the -// fake ephemeral snapshot at the ring's durable extent. 
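//
// Editor's worked example (illustrative numbers only) for the lo/hi
// split computed in skc_extent_phrwg_thr1s_snap_alloc() below:
//
//   ring->size.pow2   = 8, ring->size.mask = 7, elem size = e
//   snap->snap->reads = 6  ->  index_lo  = 6 & 7 = 6
//   snap count        = 5  ->  count_max = 8 - 6 = 2
//
//   count.lo = min(2,5) = 2    hr1.lo = hrw + 6*e   // tail of the ring
//   count.hi = 5 - 2    = 3    hr1.hi = hrw         // wrapped back to the head
//
// i.e. the snapshot is exposed as two host pointers into the durable
// extent rather than being copied anywhere.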
-// - -void -skc_extent_phrwg_thr1s_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent, - size_t const size) -{ - extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); // WRITE-ONCE -} - -void -skc_extent_phrwg_thr1s_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hrw); -} - -void -skc_extent_phrwg_thr1s_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phrwg_thr1s_snap * const snap) -{ - snap->snap = skc_extent_ring_snap_alloc(runtime,ring); -} - -void -skc_extent_phrwg_thr1s_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent, - struct skc_extent_phrwg_thr1s_snap * const snap) -{ - struct skc_extent_ring const * const ring = snap->snap->ring; - - skc_uint const count = skc_extent_ring_snap_count(snap->snap); - skc_uint const index_lo = snap->snap->reads & ring->size.mask; - skc_uint const count_max = ring->size.pow2 - index_lo; - - snap->count.lo = min(count_max,count); - snap->hr1.lo = (skc_uchar*)extent->hrw + (index_lo * ring->size.elem); - - if (count > count_max) - { - snap->count.hi = count - count_max; - snap->hr1.hi = extent->hrw; - } - else - { - snap->count.hi = 0; - snap->hr1.hi = NULL; - } -} - -void -skc_extent_phrwg_thr1s_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s_snap * const snap) -{ - skc_extent_ring_snap_free(runtime,snap->snap); -} - -// -// -// diff --git a/src/compute/skc/extent_cl_12.h b/src/compute/skc/extent_cl_12.h deleted file mode 100644 index 47ba951bb3..0000000000 --- a/src/compute/skc/extent_cl_12.h +++ /dev/null @@ -1,476 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include - -#include "suballocator.h" -#include "extent_ring.h" - -// -// Legend: -// -// p : durable -// t : ephemeral -// h : host -// d : device -// r : read -// w : write -// 1 : once -- e.g. w1 is 'write-once' -// N : many -- e.g. 
rN is 'read-many' -// g : ring -// s : ring snapshot -// -// Notes: -// -// rw : for now, read-write implies read-write many -// - -// -// DURABLE R/W HOST EXTENT -- STANDARD CACHED MEMORY -// - -struct skc_extent_phrw -{ - void * hrw; -}; - -void -skc_extent_phrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrw * const extent, - size_t const size); - -void -skc_extent_phrw_free(struct skc_runtime * const runtime, - struct skc_extent_phrw * const extent); - -// -// DURABLE R/W DEVICE EXTENT -- ALLOCATED FROM DEVICE HEAP -// - -struct skc_extent_pdrw -{ - cl_mem drw; -}; - -void -skc_extent_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_pdrw * const extent, - size_t const size); - -void -skc_extent_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_pdrw * const extent); - -// -// EPHEMERAL DEVICE R/W EXTENT -- ALLOCATED QUICKLY FROM A MANAGED RING -// - -struct skc_extent_tdrw -{ - size_t size; - cl_mem drw; - skc_subbuf_id_t id; -}; - -void -skc_extent_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_tdrw * const extent, - size_t const size); - -void -skc_extent_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_tdrw * const extent); - -void -skc_extent_tdrw_zero(struct skc_extent_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -// -// DURABLE SMALL EXTENTS BACKING ATOMICS -// - -struct skc_extent_phr_pdrw -{ - size_t size; // must be multiple of words - void * hr; - cl_mem drw; -}; - -void -skc_extent_phr_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phr_pdrw * const extent, - size_t const size); - -void -skc_extent_phr_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_phr_pdrw * const extent); - -void -skc_extent_phr_pdrw_read(struct skc_extent_phr_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phr_pdrw_zero(struct skc_extent_phr_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -// -// EPHEMERAL SMALL EXTENTS BACKING ATOMICS -// - -struct skc_extent_thr_tdrw -{ - size_t size; // must be multiple of words - - void * hr; - cl_mem drw; - - struct { - skc_subbuf_id_t hr; - skc_subbuf_id_t drw; - } id; -}; - -void -skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_thr_tdrw * const extent, - size_t const size); - -void -skc_extent_thr_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_thr_tdrw * const extent); - -void -skc_extent_thr_tdrw_read(struct skc_extent_thr_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_thr_tdrw_zero(struct skc_extent_thr_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -// -// DURABLE W/1 HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT -// - -struct skc_extent_phw1g_tdrNs -{ - void * hw1; -}; - -struct skc_extent_phw1g_tdrNs_snap -{ - struct skc_extent_ring_snap * snap; - cl_mem drN; - skc_subbuf_id_t id; -}; - -void -skc_extent_phw1g_tdrNs_alloc(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent, - size_t const size); - -void -skc_extent_phw1g_tdrNs_free(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent); - -void -skc_extent_phw1g_tdrNs_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phw1g_tdrNs_snap * const snap); - -void -skc_extent_phw1g_tdrNs_snap_alloc(struct skc_runtime * const runtime, 
- struct skc_extent_phw1g_tdrNs * const extent, - struct skc_extent_phw1g_tdrNs_snap * const snap, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phw1g_tdrNs_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs_snap * const snap); - -// -// DURABLE R/W HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT -// - -struct skc_extent_phrwg_tdrNs -{ - void * hrw; -}; - -struct skc_extent_phrwg_tdrNs_snap -{ - struct skc_extent_ring_snap * snap; - cl_mem drN; - skc_subbuf_id_t id; -}; - -void -skc_extent_phrwg_tdrNs_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent, - size_t const size); - -void -skc_extent_phrwg_tdrNs_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent); - -void -skc_extent_phrwg_tdrNs_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phrwg_tdrNs_snap * const snap); - -void -skc_extent_phrwg_tdrNs_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent, - struct skc_extent_phrwg_tdrNs_snap * const snap, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phrwg_tdrNs_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs_snap * const snap); - -// -// DURABLE HOST R/W RING WITH AN EPHEMERAL HOST R/1 SNAPSHOT -// -// Note that because the ring and snapshot are both in host memory and -// the snapshot blocks progress until freed we can simply point the -// fake ephemeral snapshot at the ring's durable extent. -// - -struct skc_extent_phrwg_thr1s -{ - void * hrw; -}; - -struct skc_extent_phrwg_thr1s_snap -{ - struct skc_extent_ring_snap * snap; - - struct { - skc_uint lo; - skc_uint hi; - } count; - - struct { - void * lo; - void * hi; - } hr1; -}; - -void -skc_extent_phrwg_thr1s_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent, - size_t const size); - -void -skc_extent_phrwg_thr1s_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent); - -void -skc_extent_phrwg_thr1s_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phrwg_thr1s_snap * const snap); - -void -skc_extent_phrwg_thr1s_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent, - struct skc_extent_phrwg_thr1s_snap * const snap); - -void -skc_extent_phrwg_thr1s_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s_snap * const snap); - -// -// EPHEMERAL MAPPING -// -// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. 
-// - -#if 0 -struct skc_extent_thrw_tdrw -{ - size_t size; - cl_mem drw; - skc_subbuf_id_t id; -}; - -void -skc_extent_thrw_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_thrw_tdrw * const extent, - size_t const size); - -void -skc_extent_thrw_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_thrw_tdrw * const extent); - -void * -skc_extent_thrw_tdrw_map_size(struct skc_extent_thrw_tdrw * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event); - -void * -skc_extent_thrw_tdrw_map(struct skc_extent_thrw_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_thrw_tdrw_unmap(struct skc_extent_thrw_tdrw * const extent, - void * const hrN, - cl_command_queue const cq, - cl_event * const event); -#endif - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. -// - -struct skc_extent_phrw_pdrw -{ - size_t size; - cl_mem drw; -}; - -void -skc_extent_phrw_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrw_pdrw * const extent, - size_t const size); - -void -skc_extent_phrw_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_phrw_pdrw * const extent); - -void * -skc_extent_phrw_pdrw_map_size(struct skc_extent_phrw_pdrw * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event); - -void * -skc_extent_phrw_pdrw_map(struct skc_extent_phrw_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phrw_pdrw_unmap(struct skc_extent_phrw_pdrw * const extent, - void * const hrN, - cl_command_queue const cq, - cl_event * const event); - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO R/O HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO W/O DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. -// - -struct skc_extent_phrN_pdwN -{ - size_t size; - cl_mem dwN; -}; - -void -skc_extent_phrN_pdwN_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrN_pdwN * const extent, - size_t const size); - -void -skc_extent_phrN_pdwN_free(struct skc_runtime * const runtime, - struct skc_extent_phrN_pdwN * const extent); - -void * -skc_extent_phrN_pdwN_map_size(struct skc_extent_phrN_pdwN * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event); - -void * -skc_extent_phrN_pdwN_map(struct skc_extent_phrN_pdwN * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phrN_pdwN_unmap(struct skc_extent_phrN_pdwN * const extent, - void * const hrN, - cl_command_queue const cq, - cl_event * const event); - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO W/O HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/O DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. 
-// - -struct skc_extent_phwN_pdrN -{ - size_t size; - cl_mem drN; -}; - -void -skc_extent_phwN_pdrN_alloc(struct skc_runtime * const runtime, - struct skc_extent_phwN_pdrN * const extent, - size_t const size); - -void -skc_extent_phwN_pdrN_free(struct skc_runtime * const runtime, - struct skc_extent_phwN_pdrN * const extent); - -void * -skc_extent_phwN_pdrN_map_size(struct skc_extent_phwN_pdrN * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event); - -void * -skc_extent_phwN_pdrN_map(struct skc_extent_phwN_pdrN * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phwN_pdrN_unmap(struct skc_extent_phwN_pdrN * const extent, - void * const hwm, - cl_command_queue const cq, - cl_event * const event); - -// -// -// diff --git a/src/compute/skc/extent_cl_12_unified.c b/src/compute/skc/extent_cl_12_unified.c deleted file mode 100644 index 69c669ad54..0000000000 --- a/src/compute/skc/extent_cl_12_unified.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// NOTE THAT NONE OF THESE EXTENTS CHECK FOR ZERO-SIZED ALLOCATIONS. -// THAT'S OK FOR NOW. -// - -#include - -#include "runtime_cl_12.h" -#include "extent_cl_12.h" -#include "common/cl/assert_cl.h" - -// -// EPHEMERAL MAPPING -// -// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. -// - -#if 0 - -#pragma message("struct skc_extent_thrw_tdrw will be removed once the sorter is installed.") - -void -skc_extent_thrw_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_thrw_tdrw * const extent, - size_t const size) -{ - extent->drw = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_WRITE /* | CL_MEM_ALLOC_HOST_PTR */, - size,&extent->id,&extent->size); -} - -void -skc_extent_thrw_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_thrw_tdrw * const extent) -{ - skc_runtime_device_temp_free(runtime,extent->drw,extent->id); -} - -void * -skc_extent_thrw_tdrw_map_size(struct skc_extent_thrw_tdrw * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event) -{ - cl_int cl_err; - - void * hrw = clEnqueueMapBuffer(cq,extent->drw, - CL_FALSE, - CL_MAP_READ | CL_MAP_WRITE,0,size, - 0,NULL,event,&cl_err); cl_ok(cl_err); - - return hrw; -} - -void * -skc_extent_thrw_tdrw_map(struct skc_extent_thrw_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - return skc_extent_thrw_tdrw_map_size(extent,extent->size,cq,event); -} - -void -skc_extent_thrw_tdrw_unmap(struct skc_extent_thrw_tdrw * const extent, - void * const hrw, - cl_command_queue const cq, - cl_event * const event) -{ - cl(EnqueueUnmapMemObject(cq,extent->drw,hrw,0,NULL,event)); -} - -#endif - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. 
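Before the implementation below, a hedged caller-side sketch of the map/write/unmap round trip these durable mappings are built for; the queue and buffer are placeholders and error handling is reduced to asserts:

#include <assert.h>
#include <string.h>
#include <CL/opencl.h>

/* Illustrative usage of a CL_MEM_ALLOC_HOST_PTR buffer: map it into the
   host address space, fill it, then unmap so the device may access it. */
static void
fill_mapped_buffer(cl_command_queue cq, cl_mem buf, size_t size, void const * src)
{
  cl_int err;

  void * hrw = clEnqueueMapBuffer(cq, buf,
                                  CL_TRUE,                    /* block until mapped */
                                  CL_MAP_WRITE, 0, size,
                                  0, NULL, NULL, &err);
  assert(err == CL_SUCCESS);

  memcpy(hrw, src, size);                                     /* host-side write    */

  err = clEnqueueUnmapMemObject(cq, buf, hrw, 0, NULL, NULL); /* hand back to device */
  assert(err == CL_SUCCESS);
}

On an integrated GPU such a map is typically zero-copy, while a discrete GPU may stage through pinned memory, which is why the note above anticipates different per-device implementations.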
-// - -void -skc_extent_phrw_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrw_pdrw * const extent, - size_t const size) -{ - cl_int cl_err; - - extent->size = size; - extent->drw = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - size,NULL,&cl_err); cl_ok(cl_err); -} - -void -skc_extent_phrw_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_phrw_pdrw * const extent) -{ - cl(ReleaseMemObject(extent->drw)); -} - -void * -skc_extent_phrw_pdrw_map_size(struct skc_extent_phrw_pdrw * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event) -{ - cl_int cl_err; - - void * hrw = clEnqueueMapBuffer(cq,extent->drw, - CL_FALSE, - CL_MAP_READ | CL_MAP_WRITE,0,size, - 0,NULL,event,&cl_err); cl_ok(cl_err); - - return hrw; -} - -void * -skc_extent_phrw_pdrw_map(struct skc_extent_phrw_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - return skc_extent_phrw_pdrw_map_size(extent,extent->size,cq,event); -} - -void -skc_extent_phrw_pdrw_unmap(struct skc_extent_phrw_pdrw * const extent, - void * const hrw, - cl_command_queue const cq, - cl_event * const event) -{ - cl(EnqueueUnmapMemObject(cq,extent->drw,hrw,0,NULL,event)); -} - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO R/O HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO W/O DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. -// - -void -skc_extent_phrN_pdwN_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrN_pdwN * const extent, - size_t const size) -{ - cl_int cl_err; - - extent->size = size; - extent->dwN = clCreateBuffer(runtime->cl.context, - CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, - size,NULL,&cl_err); cl_ok(cl_err); -} - -void -skc_extent_phrN_pdwN_free(struct skc_runtime * const runtime, - struct skc_extent_phrN_pdwN * const extent) -{ - cl(ReleaseMemObject(extent->dwN)); -} - -void * -skc_extent_phrN_pdwN_map_size(struct skc_extent_phrN_pdwN * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event) -{ - cl_int cl_err; - - void * hrN = clEnqueueMapBuffer(cq,extent->dwN, - CL_FALSE, - CL_MAP_READ,0,size, - 0,NULL,event,&cl_err); cl_ok(cl_err); - - return hrN; -} - -void * -skc_extent_phrN_pdwN_map(struct skc_extent_phrN_pdwN * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - return skc_extent_phrN_pdwN_map_size(extent,extent->size,cq,event); -} - -void -skc_extent_phrN_pdwN_unmap(struct skc_extent_phrN_pdwN * const extent, - void * const hrN, - cl_command_queue const cq, - cl_event * const event) -{ - cl(EnqueueUnmapMemObject(cq,extent->dwN,hrN,0,NULL,event)); -} - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO W/O HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/O DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. 
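The map_size routines above request non-blocking maps (blocking_map = CL_FALSE) and hand back an event, so the mapped pointer must not be dereferenced until that event has completed. A small illustrative wrapper showing one safe pattern; this is not code from the patch:

#include <assert.h>
#include <CL/opencl.h>

/* Illustrative only: enqueue a non-blocking read map and wait on the
   returned event before letting the host touch the pointer. */
static void const *
map_for_read(cl_command_queue cq, cl_mem buf, size_t size)
{
  cl_int   err;
  cl_event ready;

  void * hr = clEnqueueMapBuffer(cq, buf,
                                 CL_FALSE,            /* non-blocking map */
                                 CL_MAP_READ, 0, size,
                                 0, NULL, &ready, &err);
  assert(err == CL_SUCCESS);

  clWaitForEvents(1, &ready);                          /* pointer is valid from here on */
  clReleaseEvent(ready);

  return hr;
}

Unmapping is symmetric: enqueue clEnqueueUnmapMemObject and let the device touch the extent only after that command has completed.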
-// - -void -skc_extent_phwN_pdrN_alloc(struct skc_runtime * const runtime, - struct skc_extent_phwN_pdrN * const extent, - size_t const size) -{ - cl_int cl_err; - - extent->size = size; - extent->drN = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - size,NULL,&cl_err); cl_ok(cl_err); -} - -void -skc_extent_phwN_pdrN_free(struct skc_runtime * const runtime, - struct skc_extent_phwN_pdrN * const extent) -{ - cl(ReleaseMemObject(extent->drN)); -} - -void * -skc_extent_phwN_pdrN_map_size(struct skc_extent_phwN_pdrN * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event) -{ - cl_int cl_err; - - void * hwN = clEnqueueMapBuffer(cq,extent->drN, - CL_FALSE, - CL_MAP_WRITE,0,size, - 0,NULL,event,&cl_err); cl_ok(cl_err); - - return hwN; -} - -void * -skc_extent_phwN_pdrN_map(struct skc_extent_phwN_pdrN * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - return skc_extent_phwN_pdrN_map_size(extent,extent->size,cq,event); -} - -void -skc_extent_phwN_pdrN_unmap(struct skc_extent_phwN_pdrN * const extent, - void * const hwN, - cl_command_queue const cq, - cl_event * const event) -{ - cl(EnqueueUnmapMemObject(cq,extent->drN,hwN,0,NULL,event)); -} - -// -// -// diff --git a/src/compute/skc/fills_expand.cl b/src/compute/skc/fills_expand.cl deleted file mode 100644 index b6f56794c5..0000000000 --- a/src/compute/skc/fills_expand.cl +++ /dev/null @@ -1,309 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "atomic_cl.h" -#include "block.h" -#include "path.h" -#include "common.h" - -// -// -// - -#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) - -#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) -#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) - -#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// -// - -#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// -// - -#if ( SKC_FILLS_EXPAND_X == 1 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_FILLS_EXPAND_X == 2 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_FILLS_EXPAND_X == 4 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_FILLS_EXPAND_X == 8 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_FILLS_EXPAND_X == 16) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_FILLS_EXPAND_X" -#endif - -// -// Fill and rasterize cmds only differ in their first word semantics -// - -union skc_cmd_expand -{ - union skc_cmd_fill fill; - union skc_cmd_rasterize rasterize; -}; - -// -// -// - -union skc_path_elem -{ - skc_uint u32; - skc_float f32; -}; - -// -// COMPILE-TIME AND RUN-TIME MACROS -// - -#define SKC_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \ - 
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -#define SKC_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - SKC_BROADCAST_LAST_HELPER(E,I) - -// -// -// - -void -skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out, - skc_uint * const out_idx, - union skc_cmd_expand * const cmd, - union skc_path_elem const e, - skc_uint const e_idx) -{ - // - // FIXME -- we can append a large number of nodeword indices to a - // local SMEM queue and flush when full. It may or may not be a - // performance win on some architectures. - // - skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT; - skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0); - - cmd->rasterize.nodeword = e_idx; - - if (is_elem) { - cmds_out[*out_idx + offset] = cmd->rasterize; - } - - *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1); -} - -// -// -// - -__kernel -SKC_FILLS_EXPAND_KERNEL_ATTRIBS -void -skc_kernel_fills_expand(__global union skc_path_elem const * const blocks, - __global skc_uint volatile * const atomics, - __global skc_block_id_t const * const map, - __global union skc_cmd_fill const * const cmds_in, - __global union skc_cmd_rasterize * const cmds_out) -{ - // - // Need to harmonize the way we determine a subgroup's id. In this - // kernel it's not as important because no local memory is being - // used. Although the device/mask calc to determine subgroup and - // lanes is still proper, we might want to make it clearer that - // we're working with subgroups by using the subgroup API. 
- // - // every subgroup/simd that will work on the block loads the same command - // -#if (__OPENCL_VERSION__ < 200) - skc_uint const cmd_stride = get_num_sub_groups(); -#else - skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id(); - - // load fill command -- we reuse y component - union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] }; - - // get the path header block from the map - skc_block_id_t id = map[cmd.fill.path]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("expand[%u] = %u\n",cmd_idx,id); -#endif - - // - // blindly load all of the head elements into registers - // - skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - skc_uint count_nodes, count_prims; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ - count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \ - } \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \ - count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \ - } - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // debug of path head - // -#if 0 - skc_uint count_blocks; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ - count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ - } - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - if (get_sub_group_local_id() == 0) - printf("path header = { %5u, %5u, %5u }\n", - count_blocks,count_nodes,count_prims); -#endif - - // - // acquire slots in the expanded cmd extent - // - // decrement prim_idx by 1 so we can use inclusive warp scan later - // - skc_uint out_idx = 0; - - if (get_sub_group_local_id() == 0) { - out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP - (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1; - } - - out_idx = sub_group_broadcast(out_idx,0); - - // - // process ids trailing the path header - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ - if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \ - h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \ - } \ - } \ - skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \ - head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \ - } - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, process the nodes - // - - // - // get id of next node - // - id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); - - // - // the following blocks are nodes - // - while (true) - { - // get index of each element - skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); - - // - // blindly load all of the node elements into registers - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // append all valid ids - // -#undef 
SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \ - node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // any more nodes? - if (--count_nodes == 0) - return; - - // - // get id of next node - // - id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); - } -} - -// -// -// diff --git a/src/compute/skc/handle_pool_cl_12.c b/src/compute/skc/handle_pool_cl_12.c deleted file mode 100644 index 65288c3656..0000000000 --- a/src/compute/skc/handle_pool_cl_12.c +++ /dev/null @@ -1,752 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include -#include - -// -// -// - -#include "common/cl/assert_cl.h" - -#include "block.h" -#include "grid.h" -#include "config_cl.h" -#include "runtime_cl_12.h" - -// -// FIXME -- these comments are now quite stale -// -// -// HANDLE/ACQUIRE RELEASE -// -// The runtime vends handles just in case we decide to exploit shared -// virtual memory. But for most platforms and devices we will have a -// pool of host-managed handles and on the device there will be a -// table that maps the host handle to a device-managed memory block. -// -// HANDLE READINESS -// -// A host handle may reference a path or a raster which is not ready -// for use further down the pipeline because it hasn't yet been -// processed by the device. -// -// The simplest scheme for providing every handle a readiness state is -// to build a map that that marks a new handle as being not-ready -// while being processed by a particular grid id. When the final -// sub-pipeline grid responsible for the path or raster is complete, -// then mark the handle as being ready and eventually return the grid -// id back to the pool. This can be performed on a separate thread. -// -// The side-benefit of this approach is that a handle's reference -// count integral type can spare some bits for its associated grid id. -// -// A more memory-intensive approach uses a 64-bit epoch+grid key and -// relies on the ~56 bits of epoch space to avoid any post -// sub-pipeline status update by assuming that a handle and grid will -// match or mismatch when queried. 
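The epoch+grid alternative is only described in prose above; the following is a purely illustrative sketch of such a 64-bit readiness key, not the representation the handle pool below actually uses:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative 64-bit readiness key: ~56 bits of epoch plus an 8-bit
   grid id.  A handle is "not ready" while its stored key equals the key
   of a still-running grid; once the epoch advances the comparison can
   only mismatch, so no post-completion update of the handle is needed. */
typedef uint64_t readiness_key_t;

#define GRID_BITS 8u
#define GRID_MASK ((UINT64_C(1) << GRID_BITS) - 1)

static readiness_key_t
readiness_key_make(uint64_t epoch, uint32_t grid)
{
  return (epoch << GRID_BITS) | ((uint64_t)grid & GRID_MASK);
}

static bool
readiness_key_busy(readiness_key_t handle_key, readiness_key_t grid_key)
{
  return handle_key == grid_key;   /* equal keys: the grid still owns the handle */
}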
-// - -#define SKC_HANDLE_REFCNT_HOST_BITS (SKC_MEMBER_SIZE(union skc_handle_refcnt,h) * 8) -#define SKC_HANDLE_REFCNT_DEVICE_BITS (SKC_MEMBER_SIZE(union skc_handle_refcnt,d) * 8) - -#define SKC_HANDLE_REFCNT_HOST_MAX SKC_BITS_TO_MASK(SKC_HANDLE_REFCNT_HOST_BITS) -#define SKC_HANDLE_REFCNT_DEVICE_MAX SKC_BITS_TO_MASK(SKC_HANDLE_REFCNT_DEVICE_BITS) - -// -// -// - -static -void -skc_handle_reclaim_create(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - skc_handle_reclaim_type_e const reclaim_type, - skc_device_kernel_id const kernel_id) -{ - struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; - - // init counters - reclaim->bih.rem = 0; - - // acquire kernel - reclaim->kernel = skc_device_acquire_kernel(runtime->device,kernel_id); - reclaim->kernel_id = kernel_id; - - // set default args - cl(SetKernelArg(reclaim->kernel,0,SKC_CL_ARG(runtime->block_pool.ids.drw))); - cl(SetKernelArg(reclaim->kernel,1,SKC_CL_ARG(runtime->block_pool.blocks.drw))); - cl(SetKernelArg(reclaim->kernel,2,SKC_CL_ARG(runtime->block_pool.atomics.drw))); - cl(SetKernelArg(reclaim->kernel,3,SKC_CL_ARG(runtime->config->block_pool.ring_mask))); - cl(SetKernelArg(reclaim->kernel,4,SKC_CL_ARG(runtime->handle_pool.map.drw))); -} - -static -void -skc_handle_reclaim_dispose(struct skc_runtime * const runtime, - skc_handle_reclaim_type_e const reclaim_type) -{ - struct skc_handle_reclaim * const reclaim = runtime->handle_pool.reclaim + reclaim_type; - - cl(ReleaseKernel(reclaim->kernel)); -} - -// -// -// - -#define SKC_HANDLE_POOL_BLOCKS_PAD 8 - -void -skc_handle_pool_create(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - skc_uint const size, - skc_uint const width, - skc_uint const recs) -{ - skc_uint const blocks = (size + width - 1) / width; - skc_uint const blocks_padded = blocks + SKC_HANDLE_POOL_BLOCKS_PAD; - skc_uint const handles = blocks * width; - skc_uint const handles_padded = blocks_padded * width; - skc_uint const recs_padded = recs + 2; // one for pointer and one for head node - - skc_extent_pdrw_alloc(runtime,&handle_pool->map,handles * sizeof(skc_block_id_t)); - - handle_pool->handle.indices = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,handles_padded * sizeof(*handle_pool->handle.indices)); - handle_pool->handle.refcnts = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,handles * sizeof(*handle_pool->handle.refcnts)); - handle_pool->block.indices = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,blocks_padded * sizeof(*handle_pool->block.indices)); - handle_pool->recs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,recs_padded * sizeof(*handle_pool->recs)); - - // initialize handles and refcnts - for (skc_uint ii=0; iihandle.indices[ii] = ii; - - for (skc_uint ii=0; iihandle.refcnts[ii].hd = 0; - - handle_pool->handle.count = handles; - - // initialize block accounting - for (skc_uint ii=0; iiblock.indices[ii] = ii; - - handle_pool->block.count = blocks_padded; - handle_pool->block.width = width; - - handle_pool->block.tos = blocks; // pop = pre-decrement / push = post-increment - handle_pool->block.bos = blocks; // pop = post-increment / push = pre-decrement - - // initialize recs -- first two elements are interpreted differently - handle_pool->recs[0].runtime = runtime; - handle_pool->recs[1] = (union skc_handle_reclaim_rec){ .rem = recs, .head = 2 }; - - for (skc_uint ii=2; iirecs[ii] = (union skc_handle_reclaim_rec){ .index = ii, .next = ii+1 }; - - 
handle_pool->recs[recs_padded-1].next = SKC_UINT_MAX; - - // initialize acquire - handle_pool->acquire.rem = 0; - - // create reclaimers - skc_handle_reclaim_create(runtime, - handle_pool, - SKC_HANDLE_RECLAIM_TYPE_PATH, - SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM); - - skc_handle_reclaim_create(runtime, - handle_pool, - SKC_HANDLE_RECLAIM_TYPE_RASTER, - SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM); -} - -// -// -// - -void -skc_handle_pool_dispose(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool) -{ - skc_handle_reclaim_dispose(runtime,SKC_HANDLE_RECLAIM_TYPE_RASTER); - skc_handle_reclaim_dispose(runtime,SKC_HANDLE_RECLAIM_TYPE_PATH); - - skc_runtime_host_perm_free(runtime,handle_pool->recs); - skc_runtime_host_perm_free(runtime,handle_pool->block.indices); - skc_runtime_host_perm_free(runtime,handle_pool->handle.refcnts); - skc_runtime_host_perm_free(runtime,handle_pool->handle.indices); - - skc_extent_pdrw_free(runtime,&handle_pool->map); -} - -// -// -// - -static -skc_uint -skc_handle_pool_block_readable_pop(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool) -{ - SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,handle_pool->block.tos == 0); - - skc_uint const index = handle_pool->block.indices[--handle_pool->block.tos]; - -#if 0 - skc_handle_t * handles = handle_pool->handle.indices + (index + 1) * handle_pool->block.width; - for (skc_uint ii=0; iiblock.width; ii++) - printf("R-: %u\n",*--handles); -#endif - - return index; -} - -static -void -skc_handle_pool_block_readable_push(struct skc_handle_pool * const handle_pool, - skc_uint const index) -{ - handle_pool->block.indices[handle_pool->block.tos++] = index; - -#if 0 - skc_handle_t * handles = handle_pool->handle.indices + (index + 1) * handle_pool->block.width; - for (skc_uint ii=0; iiblock.width; ii++) - printf("R+: %u\n",*--handles); -#endif -} - - -static -skc_uint -skc_handle_pool_block_writable_pop(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool) -{ - SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,handle_pool->block.bos == handle_pool->block.count); - - return handle_pool->block.indices[handle_pool->block.bos++]; -} - -static -void -skc_handle_pool_block_writable_push(struct skc_handle_pool * const handle_pool, - skc_uint const block_idx) -{ - handle_pool->block.indices[--handle_pool->block.bos] = block_idx; -} - -// -// May need to acquire the path or raster handle *early* just to be -// sure one exists -// - -skc_handle_t -skc_runtime_handle_device_acquire(struct skc_runtime * const runtime) -{ - struct skc_handle_pool * const handle_pool = &runtime->handle_pool; - - // acquire a block of handles at a time - if (handle_pool->acquire.rem == 0) - { - skc_uint const block_idx = skc_handle_pool_block_readable_pop(runtime,handle_pool); - - handle_pool->acquire.block = block_idx; - handle_pool->acquire.rem = handle_pool->block.width; - handle_pool->acquire.handles = handle_pool->handle.indices + (block_idx + 1) * handle_pool->block.width; - } - - // load handle from next block slot - skc_uint const rem = --handle_pool->acquire.rem; - skc_handle_t const handle = *--handle_pool->acquire.handles; - - // initialize refcnt for handle - handle_pool->handle.refcnts[handle] = (union skc_handle_refcnt){ .h = 1, .d = 1 }; - - // if this was the last handle in the block then move the block id - // to the reclamation stack to be used as a scratchpad - if (rem == 0) { - skc_handle_pool_block_writable_push(handle_pool,handle_pool->acquire.block); - } - - return handle; 
-} - -// -// -// - -static -void -skc_handle_reclaim_completion(union skc_handle_reclaim_rec * const recN) -{ - // get root rec which contains pointer to runtime - union skc_handle_reclaim_rec * const rec0 = recN - recN->index; - union skc_handle_reclaim_rec * const rec1 = rec0 + 1; - - // return block for reading - skc_handle_pool_block_readable_push(&rec0->runtime->handle_pool,recN->block); - - // recN is new head of list - recN->next = rec1->head; - rec1->head = recN->index; - rec1->rem += 1; -} - -static -void -skc_handle_reclaim_cb(cl_event event, cl_int status, union skc_handle_reclaim_rec * const recN) -{ - SKC_CL_CB(status); - - union skc_handle_reclaim_rec * const rec0 = recN - recN->index; - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(rec0->runtime->scheduler,skc_handle_reclaim_completion,recN); -} - -// -// FIXME -- is there an issue launching on the host thread? -// - -static -void -skc_handle_reclaim_launch(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - struct skc_handle_reclaim * const reclaim, - union skc_handle_reclaim_rec * const recN) -{ - cl(SetKernelArg(reclaim->kernel, - 5, - handle_pool->block.width * sizeof(skc_handle_t), - reclaim->bih.handles)); - - // acquire a cq - cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); - - cl_event complete; - - // the kernel grid is shaped by the target device - skc_device_enqueue_kernel(runtime->device, - reclaim->kernel_id, - cq, - reclaim->kernel, - handle_pool->block.width, - 0,NULL,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_handle_reclaim_cb,recN)); - cl(ReleaseEvent(complete)); - - // kickstart kernel execution - cl(Flush(cq)); - - // release the cq - skc_runtime_release_cq_in_order(runtime,cq); -} - -// -// reclaim a handle -// - -static -union skc_handle_reclaim_rec * -skc_handle_acquire_reclaim_rec(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool) -{ - union skc_handle_reclaim_rec * const rec1 = handle_pool->recs + 1; - - SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,rec1->rem == 0); - - union skc_handle_reclaim_rec * const recN = handle_pool->recs + rec1->head; - - rec1->head = recN->next; - rec1->rem -= 1; - - // fprintf(stderr,"rec1->rem = %u\n",rec1->rem); - - return recN; -} - -static -void -skc_runtime_device_reclaim(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - struct skc_handle_reclaim * const reclaim, - skc_handle_t const handle) -{ - // grab a new block? - if (reclaim->bih.rem == 0) - { - skc_uint const block_idx = skc_handle_pool_block_writable_pop(runtime,handle_pool); - - reclaim->bih.block = block_idx; - reclaim->bih.rem = handle_pool->block.width; - reclaim->bih.handles = handle_pool->handle.indices + (block_idx + 1) * handle_pool->block.width; - } - - // store handle -- handle's refcnt was already set to {0:0} - *--reclaim->bih.handles = handle; - - // if block is full then launch reclamation kernel - if (--reclaim->bih.rem == 0) - { - union skc_handle_reclaim_rec * recN = skc_handle_acquire_reclaim_rec(runtime,handle_pool); - - recN->block = reclaim->bih.block; - - skc_handle_reclaim_launch(runtime,handle_pool,reclaim,recN); - } -} - -// -// Validate host-provided handles before retaining. 
-// -// Retain validation consists of: -// -// - correct handle type -// - handle is in range of pool -// - host refcnt is not zero -// - host refcnt is not at the maximum value -// -// After validation, retain the handles for the host -// - -static -skc_err -skc_runtime_handle_host_validated_retain(struct skc_runtime * const runtime, - skc_typed_handle_type_e const handle_type, - skc_typed_handle_t const * const typed_handles, - uint32_t const count) -{ - // - // FIXME -- test to make sure handles aren't completely out of range integers - // - - union skc_handle_refcnt * const refcnts = runtime->handle_pool.handle.refcnts; - - for (skc_uint ii=0; ii= runtime->handle_pool.handle.count) - { - return SKC_ERR_HANDLE_INVALID; - } - else - { - union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; - skc_uint const host = refcnt_ptr->h; - - if (host == 0) - { - return SKC_ERR_HANDLE_INVALID; - } - else if (host == SKC_HANDLE_REFCNT_HOST_MAX) - { - return SKC_ERR_HANDLE_OVERFLOW; - } - } - } - } - - // - // all the handles validated, so retain them all.. - // - for (skc_uint ii=0; iideps,rasters,count); - - return SKC_ERR_SUCCESS; -} - -skc_err -skc_runtime_path_host_flush(struct skc_runtime * const runtime, - skc_path_t const * paths, - uint32_t count) -{ - skc_grid_deps_force(runtime->deps,paths,count); - - return SKC_ERR_SUCCESS; -} - -// -// Validate host-provided handles before releasing. -// -// Release validation consists of: -// -// - correct handle type -// - handle is in range of pool -// - host refcnt is not zero -// -// After validation, release the handles for the host -// - -static -skc_err -skc_runtime_host_validated_release(struct skc_runtime * const runtime, - skc_typed_handle_type_e const type, - skc_handle_reclaim_type_e const reclaim_type, - skc_typed_handle_t const * const handles, - uint32_t const count) -{ - struct skc_handle_pool * const handle_pool = &runtime->handle_pool; - union skc_handle_refcnt * const refcnts = handle_pool->handle.refcnts; - - for (skc_uint ii=0; ii= handle_pool->handle.count) - { - return SKC_ERR_HANDLE_INVALID; - } - else - { - union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; - skc_uint const host = refcnt_ptr->h; - - if (host == 0) - { - return SKC_ERR_HANDLE_INVALID; - } - } - } - } - - // - // all the handles validated, so release them all.. 
- // - struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; - - for (skc_uint ii=0; iihandle_pool.handle.refcnts; - - while (count-- > 0) - { - skc_typed_handle_t const typed_handle = *handles++; - - if (!SKC_TYPED_HANDLE_IS_TYPE(typed_handle,type)) - { - return SKC_ERR_HANDLE_INVALID; - } - else - { - skc_handle_t const handle = SKC_TYPED_HANDLE_TO_HANDLE(typed_handle); - - if (handle >= runtime->handle_pool.handle.count) - { - return SKC_ERR_HANDLE_INVALID; - } - else - { - union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; - union skc_handle_refcnt refcnt = *refcnt_ptr; - - if (refcnt.h == 0) - { - return SKC_ERR_HANDLE_INVALID; - } - else if (refcnt.d == SKC_HANDLE_REFCNT_DEVICE_MAX) - { - return SKC_ERR_HANDLE_OVERFLOW; - } - } - } - } - - return SKC_ERR_SUCCESS; -} - -// -// After validation, retain the handles for the device -// - -void -skc_runtime_handle_device_retain(struct skc_runtime * const runtime, - skc_handle_t const * handles, - uint32_t count) -{ - union skc_handle_refcnt * const refcnts = runtime->handle_pool.handle.refcnts; - - while (count-- > 0) - refcnts[SKC_TYPED_HANDLE_TO_HANDLE(*handles++)].d++; -} - -// -// Release the device-held handles -- no validation required! -// - -static -void -skc_runtime_handle_device_release(struct skc_runtime * const runtime, - skc_handle_reclaim_type_e const reclaim_type, - skc_handle_t const * handles, - skc_uint count) -{ - struct skc_handle_pool * const handle_pool = &runtime->handle_pool; - union skc_handle_refcnt * const refcnts = handle_pool->handle.refcnts; - struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; - - while (count-- > 0) { - skc_handle_t const handle = *handles++; - union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; - union skc_handle_refcnt refcnt = *refcnt_ptr; - - refcnt.d -= 1; - *refcnt_ptr = refcnt; - -#if 0 - printf("%8u = { %u, %u }\n",handle,refcnt.h,refcnt.d); -#endif - - if (refcnt.hd == 0) { - skc_runtime_device_reclaim(runtime,handle_pool,reclaim,handle); - } - } -} - -// -// -// - -void -skc_runtime_path_device_release(struct skc_runtime * const runtime, - skc_handle_t const * handles, - skc_uint count) -{ - skc_runtime_handle_device_release(runtime,SKC_HANDLE_RECLAIM_TYPE_PATH,handles,count); -} - -void -skc_runtime_raster_device_release(struct skc_runtime * const runtime, - skc_handle_t const * handles, - skc_uint count) -{ - skc_runtime_handle_device_release(runtime,SKC_HANDLE_RECLAIM_TYPE_RASTER,handles,count); -} - -// -// -// diff --git a/src/compute/skc/handle_pool_cl_12.h b/src/compute/skc/handle_pool_cl_12.h deleted file mode 100644 index 4fefae3552..0000000000 --- a/src/compute/skc/handle_pool_cl_12.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include "macros.h" -#include "handle.h" -#include "extent_cl_12.h" -#include "device_cl_12.h" - -// -// FIXME -- THIS DOCUMENTATION IS STALE NOW THAT A REFERENCE COUNT REP -// IS A {HOST:DEVICE} PAIR. -// -// Host-side handle pool -// -// The bulk size of the three extents is currently 6 bytes of overhead -// per number of host handles. The number of host handles is usually -// less than the number of blocks in the pool. Note that the maximum -// number of blocks is 2^27. -// -// A practical instantiation might provide a combined 2^20 path and -// raster host handles. 
This would occupy 6 MB of host RAM for the -// 32-bit handle, 8-bit reference count and 8-bit handle-to-grid map. -// -// Also note that we could use isolated/separate path and raster block -// pools. Worst case, this would double the memory footprint of SKC. -// -// Host-side handle reference count -// -// [0 ] : release -// [1..UMAX] : retain -// -// In a garbage-collected environment we might want to rely on an -// existing mechanism for determing whether a handle is live. -// -// Otherwise, we probably want to have a 16 or 32-bit ref count. -// -// The handle reference count is defensive and will not allow the host -// to underflow a handle that's still retained by the pipeline. -// -// The single reference counter is split into host and device counts. -// - -union skc_handle_refcnt -{ - skc_ushort hd; // host and device - - struct { - skc_uchar h; // host - skc_uchar d; // device - }; -}; - -SKC_STATIC_ASSERT(SKC_MEMBER_SIZE(union skc_handle_refcnt,hd) == - SKC_MEMBER_SIZE(union skc_handle_refcnt,h) + - SKC_MEMBER_SIZE(union skc_handle_refcnt,d)); - -// -// -// - -struct skc_handle_bih -{ - skc_uint block; - skc_uint rem; - skc_handle_t * handles; -}; - -struct skc_handle_reclaim -{ - struct skc_handle_bih bih; - - cl_kernel kernel; - skc_device_kernel_id kernel_id; -}; - -union skc_handle_reclaim_rec -{ - // ELEMENT 0 - struct skc_runtime * runtime; - - // ELEMENT 1 - struct { - skc_uint rem; // # of available records - skc_uint head; // index of first record - }; - - // ELEMENTS 2+ - struct { - skc_uint index; // index of this record -- never modified - union { - skc_uint next; // index of next record - skc_uint block; // block index of reclaimed handles - }; - }; -}; - -SKC_STATIC_ASSERT(sizeof(union skc_handle_reclaim_rec) == sizeof(skc_uint2)); - -// -// -// - -typedef enum skc_handle_reclaim_type_e { - - SKC_HANDLE_RECLAIM_TYPE_PATH, - SKC_HANDLE_RECLAIM_TYPE_RASTER, - - SKC_HANDLE_RECLAIM_TYPE_COUNT - -} skc_handle_reclaim_type_e; - -struct skc_handle_pool -{ - // - // FIXME -- should we be pedantic and make these always-host-side - // allocations "extents" as well? I think it's OK not being an - // extent structure for now and is mostly consistent with the rest - // of the code. - // - // FIXME -- the cbs[] array is a little idiosyncratic but the intent - // is to avoid storing the 64-bit backpointer inside of every single - // record. This can be harmonized later. Note that only a few - // hundred outstanding callbacks would represent many many subgroups - // of work and would fully occupy the GPU (if we allow it). 
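As a sanity check on the 6 MB figure quoted in the (admittedly stale) comment above, a throwaway sketch of the per-handle host overhead; the names are hypothetical:

#include <stdint.h>

/* Illustrative arithmetic only: per host handle the pool keeps a 32-bit
   handle index, an 8-bit host refcount and an 8-bit handle-to-grid map
   entry, i.e. about 6 bytes of host overhead per handle. */
enum {
  BYTES_PER_HANDLE = sizeof(uint32_t)   /* handle index       */
                   + sizeof(uint8_t)    /* host refcount      */
                   + sizeof(uint8_t),   /* handle-to-grid map */

  HANDLE_COUNT     = 1u << 20           /* combined path+raster handles */
};

/* 6 bytes * 2^20 handles = 6 MiB of host RAM, i.e. the 6 MB quoted above. */
typedef char footprint_check[(BYTES_PER_HANDLE * HANDLE_COUNT == 6u << 20) ? 1 : -1];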
- // - // - struct skc_extent_pdrw map; // device-managed extent mapping a host handle to device block id - - struct { - skc_handle_t * indices; // array of individual host handles -- fragmented into blocks - union skc_handle_refcnt * refcnts; // array of reference counts indexed by an individual handle - skc_uint count; - } handle; - - struct { - skc_uint * indices; // stack of indices to fixed-size blocks of host handles - skc_uint count; // number of handles -- valid from [0,size) - skc_uint width; // width of a fixed-size block of handles - skc_uint tos; // grows upward / push++ / --pop / # fixed-size blocks for reading - skc_uint bos; // grows downward / --push / pop++ / # fixed-size blocks for writing - } block; - - union skc_handle_reclaim_rec * recs; // array of reclaim records - - struct skc_handle_bih acquire; - struct skc_handle_reclaim reclaim[SKC_HANDLE_RECLAIM_TYPE_COUNT]; -}; - -// -// -// - -void -skc_handle_pool_create(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - skc_uint const size, - skc_uint const width, - skc_uint const recs); - -void -skc_handle_pool_dispose(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool); - -// -// -// diff --git a/src/compute/skc/interop.c b/src/compute/skc/interop.c deleted file mode 100644 index 6697bb7e83..0000000000 --- a/src/compute/skc/interop.c +++ /dev/null @@ -1,629 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include -#include - -// -// -// - -#include -#include -#include -#include - -// -// -// - -#include "common/cl/assert_cl.h" -#include "types.h" - -// -// -// - -#include "interop.h" -#include "context.h" -#include "runtime_cl_12.h" - -// -// -// - -#include "svg2skc/transform_stack.h" - -// -// -// - -#if 1 -#define SKC_IMAGE_FORMAT GL_RGBA8 -#else -#define SKC_IMAGE_FORMAT GL_RGBA16F -#endif - -// -// -// - -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif - -// -// -// - -struct skc_interop_fb -{ - cl_context context; - - GLuint fbo; - GLuint rbo; - - cl_mem mem; - - int width; - int height; - - bool is_srgb; - bool is_vsync_on; - bool is_fullscreen; - bool is_iconified; - bool is_resized; - bool is_spinning; - bool is_info; - - skc_float scale; - skc_float2 translate; - float rotate_theta; -}; - -static struct skc_interop_fb fb = - { - .mem = NULL, - - .is_srgb = true, - .is_vsync_on = false, - .is_fullscreen = false, - .is_iconified = false, - .is_resized = true, - .is_spinning = false, - .is_info = false, - - .scale = 1.0f, - .translate = { 0.0f, 0.0f }, - .rotate_theta = 0.0f - }; - -// -// FPS COUNTER FROM HERE: -// -// http://antongerdelan.net/opengl/glcontext2.html -// - -static -void -skc_interop_fps(GLFWwindow * window) -{ - if (fb.is_fullscreen) - return; - - // static fps counters - static double stamp_prev = 0.0; - static int frame_count = 0; - - // locals - double const stamp_curr = glfwGetTime(); - double const elapsed = stamp_curr - stamp_prev; - - if (elapsed >= 0.5) - { - stamp_prev = stamp_curr; - - double const fps = (double)frame_count / elapsed; - - char tmp[64]; - - sprintf_s(tmp,64,"(%d x %d) - VSync %s - sRGB %s - FPS: %.2f", - fb.width,fb.height, - fb.is_vsync_on ? "ON" : "OFF", - fb.is_srgb ? 
"ENABLED" : "DISABLED", - fps); - - glfwSetWindowTitle(window,tmp); - - frame_count = 0; - } - - frame_count++; -} - -// -// INITIALIZE GLFW/GLAD -// - -static -void -skc_interop_error_callback(int error, char const * description) -{ - fputs(description,stderr); -} - -// -// -// - -static -void -skc_interop_iconify_callback(GLFWwindow * window, int iconified) -{ - fb.is_iconified = iconified; -} - -// -// -// - -static -void -skc_interop_key_callback(GLFWwindow * window, int key, int scancode, int action, int mods) -{ - if (action == GLFW_RELEASE) - return; - - switch (key) - { - case GLFW_KEY_EQUAL: - fb.rotate_theta = 0.0f; - break; - - case GLFW_KEY_I: - fb.is_info = true; - break; - - case GLFW_KEY_R: - fb.is_spinning ^= true; - break; - - case GLFW_KEY_S: - fb.is_srgb ^= true; - if (fb.is_srgb) - glEnable(GL_FRAMEBUFFER_SRGB); - else - glDisable(GL_FRAMEBUFFER_SRGB); - break; - - case GLFW_KEY_V: - fb.is_vsync_on ^= true; - glfwSwapInterval(fb.is_vsync_on ? 1 : 0); - break; - - case GLFW_KEY_W: - glfwSetWindowSize(window,1024,1024); - break; - - case GLFW_KEY_ESCAPE: - glfwSetWindowShouldClose(window,GL_TRUE); - break; - } -} - -static -void -skc_interop_window_size_callback(GLFWwindow * window, int width, int height) -{ - fb.width = width; - fb.height = height; - fb.is_resized = true; - -#if 0 - skc_render_kernel_set_clip(0,0,width,height); -#endif -} - -static -void -skc_interop_scale(double const scale_offset) -{ -#define SKC_SCALE_FACTOR 1.05 - - static double scale_exp = 0.0; - - scale_exp += scale_offset; - fb.scale = (float)pow(SKC_SCALE_FACTOR,scale_exp); -} - -static -void -skc_interop_scroll_callback(GLFWwindow * window, double xoffset, double yoffset) -{ - bool const ctrl = - (glfwGetKey(window,GLFW_KEY_LEFT_CONTROL) == GLFW_PRESS) || - (glfwGetKey(window,GLFW_KEY_RIGHT_CONTROL) == GLFW_PRESS); - - if (!ctrl) - return; - - skc_interop_scale(yoffset); -} - -static -void -skc_interop_translate(float const dx, float const dy) -{ - float const dx_scaled = dx / fb.scale; - float const dy_scaled = dy / fb.scale; - - float const cos_theta = cosf(fb.rotate_theta); // replace with cospi if available - float const sin_theta = sinf(fb.rotate_theta); // replace with sinpi if available - - fb.translate.x += dx_scaled*cos_theta + dy_scaled*sin_theta; - fb.translate.y += dy_scaled*cos_theta - dx_scaled*sin_theta; -} - -static -void -skc_interop_cursor_position_callback(GLFWwindow * window, double x, double y) -{ - int const state = glfwGetMouseButton(window,GLFW_MOUSE_BUTTON_LEFT); - - static bool is_mouse_dragging = false; - static float x_prev=0.0, y_prev=0.0; - - float const mx = (float)x; - float const my = (float)y; - - if (state == GLFW_PRESS) - { - if (is_mouse_dragging) - { - const bool ctrl = - (glfwGetKey(window,GLFW_KEY_LEFT_CONTROL) == GLFW_PRESS) || - (glfwGetKey(window,GLFW_KEY_RIGHT_CONTROL) == GLFW_PRESS); - - if (ctrl) - { - float const cx = 0.5f * fb.width; - float const cy = 0.5f * fb.height; - - // find angle between mouse and center - float const vx = x_prev - cx; - float const vy = y_prev - cy; - - float const wx = mx - cx; - float const wy = my - cy; - - float const len = sqrtf((vx*vx + vy*vy) * (wx*wx + wy*wy)); - - if (len > 0.0f) - { - float const dot = vx*wx + vy*wy; - float const da = acosf(dot / len); - - if (vx*wy - vy*wx >= 0.0f) - fb.rotate_theta += da; - else - fb.rotate_theta -= da; - - fb.rotate_theta = fmodf(fb.rotate_theta,(float)(M_PI*2.0)); - } - } - else - { - skc_interop_translate(mx - x_prev, - my - y_prev); - } - } - else - { - is_mouse_dragging 
= true; - } - - x_prev = mx; - y_prev = my; - } - else - { - is_mouse_dragging = false; - } -} - -// -// -// - -static -void -skc_interop_resize() -{ - fb.is_resized = false; - - // release the image2d - if (fb.mem != NULL) - cl(ReleaseMemObject(fb.mem)); - - // resize rbo - glNamedRenderbufferStorage(fb.rbo, - SKC_IMAGE_FORMAT, - fb.width, - fb.height); - - // attach rbo to fbo - glNamedFramebufferRenderbuffer(fb.fbo, - GL_COLOR_ATTACHMENT0, - GL_RENDERBUFFER, - fb.rbo); - // - // - // - cl_int cl_err; - - fb.mem = clCreateFromGLRenderbuffer(fb.context, - CL_MEM_WRITE_ONLY, - fb.rbo, - &cl_err); cl_ok(cl_err); - // - // for debugging porpoises! - // - cl_image_format format; - - cl(GetImageInfo(fb.mem, - CL_IMAGE_FORMAT, - sizeof(format), - &format, - NULL)); -} - -// -// -// - -static -void -skc_interop_acquire() -{ - // frame buffer object - glCreateFramebuffers(1,&fb.fbo); - - // render buffer object w/a color buffer - glCreateRenderbuffers(1,&fb.rbo); - - // size rbo - glNamedRenderbufferStorage(fb.rbo, - SKC_IMAGE_FORMAT, - fb.width, - fb.height); - - // attach rbo to fbo - glNamedFramebufferRenderbuffer(fb.fbo, - GL_COLOR_ATTACHMENT0, - GL_RENDERBUFFER, - fb.rbo); -} - -void -skc_interop_register(skc_context_t context) -{ - fb.context = context->runtime->cl.context; -} - -// -// -// - -void -skc_interop_init(GLFWwindow * * window) -{ - // - // INITIALIZE GLFW/GLAD - // - glfwSetErrorCallback(skc_interop_error_callback); - - if (!glfwInit()) - exit(EXIT_FAILURE); - - GLFWmonitor * const primary = glfwGetPrimaryMonitor(); - GLFWvidmode const * const mode = glfwGetVideoMode(primary); - - if (fb.is_fullscreen) - { - fb.width = mode->width; - fb.height = mode->height; - } - else - { - fb.width = 1600; - fb.height = 1024; - } - - glfwWindowHint(GLFW_ALPHA_BITS, 0); - glfwWindowHint(GLFW_DEPTH_BITS, 0); - glfwWindowHint(GLFW_STENCIL_BITS, 0); - - glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE); - - glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); - glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5); - - glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); - - *window = glfwCreateWindow(fb.width,fb.height, - "Skia Compute", - fb.is_fullscreen ? primary : NULL, - NULL); - - if (*window == NULL) - { - glfwTerminate(); - exit(EXIT_FAILURE); - } - - glfwMakeContextCurrent(*window); - - // set up GLAD - gladLoadGLLoader((GLADloadproc)glfwGetProcAddress); - - // ignore vsync for now - glfwSwapInterval(fb.is_vsync_on ? 1 : 0); - - // only copy r/g/b - glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); - - // enable SRGB, disable scissor - glEnable(GL_FRAMEBUFFER_SRGB); - glDisable(GL_SCISSOR_TEST); - - // - // SET USER POINTER AND CALLBACKS - // - glfwSetKeyCallback (*window,skc_interop_key_callback); - glfwSetFramebufferSizeCallback(*window,skc_interop_window_size_callback); - glfwSetScrollCallback (*window,skc_interop_scroll_callback); - glfwSetCursorPosCallback (*window,skc_interop_cursor_position_callback); - glfwSetWindowIconifyCallback (*window,skc_interop_iconify_callback); - - // - // - // - fprintf(stderr, - "GL_VENDOR : %s\n" - "GL_RENDERER : %s\n", - glGetString(GL_VENDOR), - glGetString(GL_RENDERER)); - - // - // acquire an FBO/RBO - // - skc_interop_acquire(); -} - -// -// -// - -#define SKC_ROTATE_STEP ((float)(M_PI / 180.0)) - -static -void -skc_interop_transform(struct skc_transform_stack * ts) -{ - // OpenGL'ism - skc_transform_stack_push_affine(ts, - 1.0f, 0.0f,0.0f, - 0.0f,-1.0f,(float)fb.height); - // multiply - skc_transform_stack_concat(ts); - - // spinner... 
- if (fb.is_spinning) - fb.rotate_theta = fmodf(fb.rotate_theta + SKC_ROTATE_STEP,(float)(M_PI*2.0)); - - // always rotate and scale around surface center point - skc_transform_stack_push_rotate_scale_xy(ts, - fb.rotate_theta, - fb.scale,fb.scale, - 0.5f*fb.width,0.5f*fb.height); - skc_transform_stack_concat(ts); - - // where did the mouse take us? - skc_transform_stack_push_translate(ts, - fb.translate.x,fb.translate.y); - skc_transform_stack_concat(ts); -} - - -void -skc_interop_poll(GLFWwindow * window, - struct skc_transform_stack * ts) -{ - // wait until uniconified - while (fb.is_iconified) - { - glfwWaitEvents(); - continue; - } - - // what's happended? - glfwPollEvents(); - - // resize? - if (fb.is_resized) - skc_interop_resize(); - - // monitor fps - skc_interop_fps(window); - - skc_interop_transform(ts); -} - -// -// -// - -void -skc_interop_blit(GLFWwindow * window) -{ - // blit skc rbo - glBlitNamedFramebuffer(fb.fbo,0, - 0,0,fb.width,fb.height, - 0,0,fb.width,fb.height, - GL_COLOR_BUFFER_BIT, - GL_NEAREST); - -#if 0 - // - // FIXME -- this clear does nothing! - // - // As a hack we're clearing the interop'd RBO with a - // clEnqueueFillImage(). - // - float const rgba[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; - // GLenum const attachments[] = { GL_COLOR_ATTACHMENT0 }; - // glInvalidateNamedFramebufferData(fb.fbo,1,attachments); - glClearNamedFramebufferfv(fb.fbo,GL_COLOR,0,rgba); -#endif - - // swap buffers - glfwSwapBuffers(window); -} - -// -// -// - -void * -skc_interop_get_fb(GLFWwindow * window) -{ - glFlush(); - - return fb.mem; -} - -// -// -// - -void -skc_interop_get_dim(uint32_t dim[2]) -{ - dim[0] = fb.width; - dim[1] = fb.height; -} - -// -// -// - - diff --git a/src/compute/skc/interop.h b/src/compute/skc/interop.h deleted file mode 100644 index 112d365764..0000000000 --- a/src/compute/skc/interop.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include "skc.h" - -// -// -// - -void -skc_interop_init(GLFWwindow * * window); - -void -skc_interop_register(skc_context_t context); - -void -skc_interop_poll(GLFWwindow * window, - struct skc_transform_stack * ts); - -void * -skc_interop_get_fb(GLFWwindow * window); - -void -skc_interop_get_dim(uint32_t dim[2]); - -void -skc_interop_blit(GLFWwindow * window); - -// -// -// diff --git a/src/compute/skc/main.c b/src/compute/skc/main.c index 8833b0bb1c..8261f4bdf8 100644 --- a/src/compute/skc/main.c +++ b/src/compute/skc/main.c @@ -30,7 +30,7 @@ // #include -#include "interop.h" +#include "platforms/cl_12/gl/interop.h" // // @@ -49,7 +49,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context); // // -static +static void is_render_complete(skc_surface_t surface, skc_styling_t styling, @@ -67,9 +67,9 @@ int main(int argc, char** argv) { // + // // - // - if (argc <= 1) + if (argc <= 1) { fprintf(stderr,"-- missing filename\n"); return EXIT_FAILURE; // no filename @@ -110,7 +110,7 @@ main(int argc, char** argv) CL_WGL_HDC_KHR, (cl_context_properties)hDC, 0 }; - + // // create context // @@ -136,14 +136,14 @@ main(int argc, char** argv) skc_raster_builder_t raster_builder; err = skc_raster_builder_create(context,&raster_builder); - + // // create a composition // skc_composition_t composition; err = skc_composition_create(context,&composition); - + // // create a styling instance // @@ -154,7 +154,7 @@ main(int argc, char** argv) svg_doc_layer_count(svg_doc), 1000, 2 * 1024 * 1024); - + // // create a surface // @@ -191,7 +191,7 @@ main(int argc, char** argv) skc_transform_stack_restore(ts,ts_save); // decode layers -- places rasters - svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/); + svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/); // seal the composition skc_composition_seal(composition); @@ -244,7 +244,7 @@ main(int argc, char** argv) // unseal the composition skc_composition_unseal(composition,true); } - + // // dispose of mundane resources // diff --git a/src/compute/skc/make_all.bat b/src/compute/skc/make_all.bat deleted file mode 100644 index 4772cc73b4..0000000000 --- a/src/compute/skc/make_all.bat +++ /dev/null @@ -1,15 +0,0 @@ -@ECHO OFF - -CMD /C make_inl_cl.bat block_pool_init.cl -CMD /C make_inl_cl.bat fills_expand.cl -CMD /C make_inl_cl.bat paths_copy.cl -CMD /C make_inl_cl.bat rasterize.cl -CMD /C make_inl_cl.bat segment_ttrk.cl -CMD /C make_inl_cl.bat rasters_alloc.cl -CMD /C make_inl_cl.bat prefix.cl -CMD /C make_inl_cl.bat place.cl -CMD /C make_inl_cl.bat segment_ttck.cl -CMD /C make_inl_cl.bat render.cl -CMD /C make_inl_cl.bat paths_reclaim.cl -CMD /C make_inl_cl.bat rasters_reclaim.cl - diff --git a/src/compute/skc/make_inl_cl.bat b/src/compute/skc/make_inl_cl.bat deleted file mode 100644 index 777a5f3bc2..0000000000 --- a/src/compute/skc/make_inl_cl.bat +++ /dev/null @@ -1,72 +0,0 @@ -@ECHO OFF - -SET OPENCL_STD=-cl-std=CL1.2 -SET OPENCL_PRE=__OPENCL_C_VERSION__=120 - -:: OPENCL_STD=-cl-std=CL2.0 -:: OPENCL_PRE=__OPENCL_C_VERSION__=200 - -:: -:: -:: - -SET IOC=ioc64 - -:: -:: -:: - -SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info - -SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g - -SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% - -:: -:: -:: - -SET PRE_DIR=%~p1 - -CD %PRE_DIR% - -SET PRE_CL=%~n1 -SET PRE_CL=%PRE_CL%.pre.cl - -SET PRE_SRC_INL=%~n1 -SET 
PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl - -SET PRE_BIN_IR=%~n1 -SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir - -SET PRE_BIN_INL=%~n1 -SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl - -:: -:: *.pre.cl -:: *.pre.src.inl -:: - -CMD /C cl -I . -I .. -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" -CMD /C clang-format -style=Mozilla -i %PRE_CL% -CMD /C dos2unix -q %PRE_CL% -CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% - -echo %PRE_CL% -echo %PRE_SRC_INL% - -:: -:: *.pre.cl -:: *.pre.src.inl -:: - -CMD /C touch %PRE_BIN_IR% -ECHO ON -@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% -@ECHO OFF -CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% - -echo %PRE_BIN_IR% -echo %PRE_BIN_INL% - - diff --git a/src/compute/skc/path_builder_cl_12.c b/src/compute/skc/path_builder_cl_12.c deleted file mode 100644 index e915dffada..0000000000 --- a/src/compute/skc/path_builder_cl_12.c +++ /dev/null @@ -1,1443 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include -#include -#include -#include -#include - -#include "common/cl/assert_cl.h" - -#include "context.h" -#include "handle.h" -#include "grid.h" -#include "path.h" -#include "path_builder.h" - -#include "config_cl.h" -#include "export_cl_12.h" -#include "runtime_cl_12.h" -#include "path_builder_cl_12.h" - -// -// OpenCL 1.2 devices support mapping of buffers into the host address -// space. -// -// Mapped buffers must be aligned on MIN_DATA_TYPE_ALIGN_SIZE bit -// boundary (e.g. 128 bytes). This complicates coordinating sharing -// of data between the host and the device. -// -// Some OpenCL 2.0 devices support fine-grained shared virtual memory -// pointers with byte-addressing and allow simpler coordination -// strategies at the cost of maintaining cache coherency. -// -// The path builder is focused on moving bulk path data from the host -// into the device-managed "block" memory pool and arranging it into a -// SIMT/SIMD-friendly data structure that can be efficiently read by -// the rasterizer. -// -// Note that one simplifying assumption is that the maximum length of -// a *single* path can't be larger than what fits in the single extent -// (which is split into M subbuffers). This would be a very long path -// and a legitimate size limitation. -// -// For some systems, it may be appropriate to never pull path data -// into the device-managed block pool and instead present the path -// data to the device in a temporarily available allocated memory -// "zone" of paths that can be discarded all at once. -// -// For other systems, it may be appropriate to simply copy the path -// data from host to device. -// -// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be -// targeting support basic map/unmap functionality similar to OpenCL -// 1.2. Furthermore, not all OpenCL 2.0 devices support fine-grained -// sharing of memory and still require a map/unmap step... but note -// that they all support byte-aligned mapping and subbuffers. -// -// The general strategy that this particular CL_12 implementation uses -// is to allocate a large mappable bulk-data path buffer and an -// auxilary mappable command buffer. -// -// The buffers are split into a reasonable number of properly aligned -// subbuffers to enable simultaneous host and device access. 
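A minimal standalone sketch of the pattern the strategy comment above describes, under stated assumptions: one large CL_MEM_ALLOC_HOST_PTR allocation is carved into aligned regions with clCreateSubBuffer() and mapped for host-side writes with clEnqueueMapBuffer(). The function name, parameter names, and the blocking map are illustrative simplifications and not part of this patch; the deleted path_builder_cl_12.c below maps asynchronously (CL_FALSE) and tracks a completion event per command subbuffer instead.

#include <CL/cl.h>
#include <stddef.h>

//
// Carve one large allocation into `subbuf_count` equally sized subbuffers
// and map the first one for write-only host access.  Returns the mapped
// host pointer, or NULL on failure.  `subbuf_size` is assumed to be a
// multiple of the device's base address alignment.
//
static void *
map_first_subbuffer(cl_context context,
                    cl_command_queue cq,
                    size_t const subbuf_size,
                    cl_uint const subbuf_count,
                    cl_mem * const buffer_out,
                    cl_mem * const subbuf_out)
{
  cl_int err;

  // one large device extent backs all subbuffers
  cl_mem const buffer = clCreateBuffer(context,
                                       CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                       subbuf_size * subbuf_count,
                                       NULL,&err);
  if (err != CL_SUCCESS)
    return NULL;

  // a subbuffer is just an aligned region of the parent buffer
  cl_buffer_region const region = { 0, subbuf_size };

  cl_mem const subbuf = clCreateSubBuffer(buffer,
                                          CL_MEM_HOST_WRITE_ONLY,
                                          CL_BUFFER_CREATE_TYPE_REGION,
                                          &region,&err);
  if (err != CL_SUCCESS) {
    clReleaseMemObject(buffer);
    return NULL;
  }

  // blocking map for simplicity -- the real implementation maps with
  // CL_FALSE and waits on a per-subbuffer completion event instead
  void * const host = clEnqueueMapBuffer(cq,subbuf,
                                         CL_TRUE,
                                         CL_MAP_WRITE_INVALIDATE_REGION,
                                         0,subbuf_size,
                                         0,NULL,NULL,
                                         &err);
  if (err != CL_SUCCESS) {
    clReleaseMemObject(subbuf);
    clReleaseMemObject(buffer);
    return NULL;
  }

  *buffer_out = buffer;
  *subbuf_out = subbuf;
  return host;
}

Splitting the extent into fixed-size, aligned subbuffers is what lets the host fill one region while the device reads another, since OpenCL 1.2 only guarantees mappings at the device's reported alignment rather than at arbitrary byte offsets.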
-// - -// -// Blocks: -// 1 extent -// M mapped subbuffers (configurable) to allow for concurrency -// -// Commands: -// 1 extent -// M mapped subbuffers (configurable) to allow for concurrency -// -// Spans: -// M hi/lo structures -// -// { cl_sub, void*, event, base } -// -// - size of sub buffer -// - remaining -// -// - counts -// - -// -// For any kernel launch, at most one path will be discontiguous and -// defined across two sub-buffers. -// -// Nodes are updated locally until full and then stored so they will -// never be incomplete. Headers are stored locally until the path is -// ended so they will never be incomplete. -// -// A line, quad or cubic acquires 4/6/8 segments which may be spread -// across one or more congtiguous blocks. -// -// If a flush() occurs then the remaining columns of multi-segment -// paths are initialized with zero-length line, quad, cubic elements. -// -// Every block's command word has a type and a count acquired from a -// rolling counter. -// -// The kernel is passed two spans of blocks { base, count } to -// process. The grid is must process (lo.count + hi.count) blocks. -// - -struct skc_subbuffer_blocks -{ - cl_mem device; - void * host; -}; - -struct skc_subbuffer_cmds -{ - cl_mem device; - void * host; - cl_event map; -}; - -// -// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer ) -// - -typedef skc_uint skc_ringdex_t; - -union skc_ringdex_expand -{ - div_t qr; - - struct { -#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0 - skc_uint subbuf; - skc_uint block; -#else - skc_uint block; - skc_uint subbuf; -#endif - }; -}; - -// -// this record is executed by the grid -// - -struct skc_release_record -{ - struct skc_path_builder_impl * impl; // back pointer to impl - - skc_grid_t grid; // pointer to scheduled grid - - skc_uint from; // inclusive starting index : [from,to) - skc_uint to; // non-inclusive ending index : [from,to) -}; - -// -// -// - -struct skc_path_builder_impl -{ - struct skc_path_builder * path_builder; - - struct skc_runtime * runtime; - - cl_command_queue cq; - - struct { - cl_kernel alloc; - cl_kernel copy; - } kernels; - - // - // FIXME -- make this pointer to constant config - // - // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - struct { - skc_uint subbufs; // how many subbufs in the buffer? - - struct { - skc_uint buffer; // how many blocks in the buffer? - skc_uint subbuf; // how many blocks in a subbuf? 
- } blocks_per; - } ring; - // - // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^ - // - - struct { - cl_mem buffer; // backing buffer for blocks - struct skc_subbuffer_blocks * subbufs; // array of structures - } blocks; - - struct { - cl_mem buffer; // backing buffer for commands - struct skc_subbuffer_cmds * subbufs; // array of structures - } cmds; - - struct { - struct skc_release_record * records; // max release records is equal to max subbufs - skc_path_t * paths; // max paths is less than or equal to max commands - } release; - - cl_mem reads; // each kernel only requires one word to store the block pool "base" - - struct { - skc_uint rolling; // rolling counter used by cmds to map to block pool alloc - skc_ringdex_t from; - skc_ringdex_t to; - } prev; - - struct { - skc_ringdex_t from; - skc_ringdex_t to; - } curr; - - struct { - struct skc_path_head * head; // pointer to local path header -- not written until path end - struct skc_path_node * node; // pointer to local node -- may alias head until head is full - - struct { - skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated - union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids - skc_uint rem; // how many id slots left in node block - } ids; - - struct { - skc_uint rem; // how many subblocks left in block? - skc_uint rolling; // rolling counter of block of subblocks - float * next; // next subblock in current subblock block - skc_uint idx; // index of next subblock - } subblocks; - - struct { - skc_uint one; // .block = 1 - skc_uint next; // rolling counter used by cmds to map to block pool alloc - } rolling; - - skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current - } wip; -}; - -// -// FIXME -- move to a pow2 subbuffer size and dispense with division -// and modulo operations -// - -static -union skc_ringdex_expand -skc_ringdex_expand(struct skc_path_builder_impl * const impl, - skc_ringdex_t const ringdex) -{ - return (union skc_ringdex_expand){ - .qr = div(ringdex,impl->ring.blocks_per.subbuf) - }; -} - -static -void -skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl) -{ - // - // FIXME - which is faster? - // -#if 1 - impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer; -#else - impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer) ? 
-1 : impl->wip.to; -#endif - - // this path is too long -- for now assert() and die - assert(impl->wip.to != impl->curr.from); -} - -static -skc_ringdex_t -skc_ringdex_span(struct skc_path_builder_impl * const impl, - skc_ringdex_t const from, - skc_ringdex_t const to) -{ - return (to - from) % impl->ring.blocks_per.buffer; -} - -static -void -skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl) -{ - union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); - - // nothing to do if this is the first block in the subbuf - if (to.block == 0) - return; - - skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs; - - // otherwise increment and mod - impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf; -} - -static -skc_bool -skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl) -{ - return impl->curr.from == impl->curr.to; -} - -static -skc_bool -skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl) -{ - return impl->prev.from == impl->prev.to; -} - -static -skc_uint -skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl, - skc_uint const to_block) -{ - // no blocks acquired OR this is last block in subbuf - return !((impl->wip.to == impl->curr.to) || (to_block == 0)); -} - -// -// -// - -static -struct skc_release_record * -skc_release_curr(struct skc_path_builder_impl * const impl) -{ - union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); - - return impl->release.records + curr_from.subbuf; -} - -// -// FIXME -- get rid of all distant config references -- grab them at all at creation time -// - -static -void -skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl) -{ - // init header counters // { handle, blocks, nodes, prims } - impl->wip.head->header = (union skc_path_header){ - .handle = 0, - .blocks = 0, - .nodes = 0, - .prims = 0 - }; - - // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS - impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN }; - - // point wip ids at local head node - impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node - impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere - - // start with no subblocks - impl->wip.subblocks.rem = 0; -} - -// -// -// - -static -void -skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl) -{ -#if 1 - // - // FIXME -- a Duff's device might be optimal here but would have to - // be customized per device since node's could be 16-128+ words - // - while (impl->wip.ids.rem > 0) - { - impl->wip.ids.rem -= 1; - impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID; - impl->wip.ids.next += 1; - } -#else - memset(&impl->wip.ids.next->u32, - SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF - sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem); - - impl->wip.ids.next += impl->wip.ids.rem; - impl->wip.ids.rem = 0; -#endif -} - -// -// -// - -static -void -skc_zero_float(skc_float * p, skc_uint rem) -{ - memset(p,0,sizeof(*p)*rem); -} - -static -void -skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder) -{ - // - // FIXME -- it might be more performant to zero the remaining - // columns in a subblock -- a subblock at a time -- instead of the - // same column across all the subblocks - // -#if 0 - while (path_builder->line.rem > 0) - { - --path_builder->line.rem; - - *path_builder->line.coords[0]++ = 0.0f; - 
*path_builder->line.coords[1]++ = 0.0f; - *path_builder->line.coords[2]++ = 0.0f; - *path_builder->line.coords[3]++ = 0.0f; - } - - while (path_builder->quad.rem > 0) - { - --path_builder->quad.rem; - - *path_builder->line.coords[0]++ = 0.0f; - *path_builder->line.coords[1]++ = 0.0f; - *path_builder->line.coords[2]++ = 0.0f; - *path_builder->line.coords[3]++ = 0.0f; - *path_builder->line.coords[4]++ = 0.0f; - *path_builder->line.coords[5]++ = 0.0f; - } - - while (path_builder->cubic.rem > 0) - { - --path_builder->cubic.rem; - - *path_builder->line.coords[0]++ = 0.0f; - *path_builder->line.coords[1]++ = 0.0f; - *path_builder->line.coords[2]++ = 0.0f; - *path_builder->line.coords[3]++ = 0.0f; - *path_builder->line.coords[4]++ = 0.0f; - *path_builder->line.coords[5]++ = 0.0f; - *path_builder->line.coords[6]++ = 0.0f; - *path_builder->line.coords[7]++ = 0.0f; - } -#else - if (path_builder->line.rem > 0) - { - skc_zero_float(path_builder->line.coords[0],path_builder->line.rem); - skc_zero_float(path_builder->line.coords[1],path_builder->line.rem); - skc_zero_float(path_builder->line.coords[2],path_builder->line.rem); - skc_zero_float(path_builder->line.coords[3],path_builder->line.rem); - - path_builder->line.rem = 0; - } - - if (path_builder->quad.rem > 0) - { - skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem); - - path_builder->quad.rem = 0; - } - - if (path_builder->cubic.rem > 0) - { - skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem); - - path_builder->cubic.rem = 0; - } -#endif -} - -// -// -// - -static -void -skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl, - skc_uint from, - skc_uint to) -{ - // to might be out of range - to = to % impl->ring.subbufs; - -#if 0 - fprintf(stderr,"unmap: [%2u,%2u)\n",from,to); -#endif - - while (from != to) // 'to' might be out of range - { - // bring 'from' back in range - from = from % impl->ring.subbufs; - - struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; - struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; - - cl(EnqueueUnmapMemObject(impl->cq, - blocks->device, - blocks->host, - 0,NULL,NULL)); - - cl(EnqueueUnmapMemObject(impl->cq, - cmds->device, - cmds->host, - 0,NULL,NULL)); - - // bring from back in range - from = ++from % impl->ring.subbufs; - } -} - -// -// FIXME -- reuse this in create() -// - -static -void -skc_path_builder_impl_map(struct skc_path_builder_impl * const impl, - skc_uint from, - skc_uint to) -{ - // to might be out of range - to = to % impl->ring.subbufs; - -#if 0 - fprintf(stderr," map: [%2u,%2u)\n",from,to); -#endif - - while (from != to) - { - cl_int cl_err; - - struct 
skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; - struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; - - blocks->host = clEnqueueMapBuffer(impl->cq, - blocks->device, - CL_FALSE, - CL_MAP_WRITE_INVALIDATE_REGION, - 0,impl->runtime->config->paths_copy.block.subbuf, - 0,NULL,NULL, - &cl_err); cl_ok(cl_err); - - cl(ReleaseEvent(cmds->map)); - - cmds->host = clEnqueueMapBuffer(impl->cq, - cmds->device, - CL_FALSE, - CL_MAP_WRITE_INVALIDATE_REGION, - 0,impl->runtime->config->paths_copy.command.subbuf, - 0,NULL,&cmds->map, - &cl_err); cl_ok(cl_err); - - // bring from back in range - from = ++from % impl->ring.subbufs; - } - // - // FIXME -- when we switch to out of order queues we'll need a barrier here - // -} - -// -// -// - -static -void -skc_path_builder_release_dispose(struct skc_release_record * const release, - struct skc_path_builder_impl * const impl) -{ - struct skc_runtime * runtime = impl->runtime; - - if (release->from <= release->to) // no wrap - { - skc_path_t const * paths = impl->release.paths + release->from; - skc_uint count = release->to - release->from; - - skc_grid_deps_unmap(runtime->deps,paths,count); - skc_runtime_path_device_release(runtime,paths,count); - } - else // from > to implies wrap - { - skc_path_t const * paths_lo = impl->release.paths + release->from; - skc_uint count_lo = impl->ring.blocks_per.buffer - release->from; - - skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo); - skc_runtime_path_device_release(runtime,paths_lo,count_lo); - - skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to); - skc_runtime_path_device_release(runtime,impl->release.paths,release->to); - } - - release->to = release->from; -} - -static -void -skc_path_builder_grid_pfn_dispose(skc_grid_t const grid) -{ - struct skc_release_record * const release = skc_grid_get_data(grid); - struct skc_path_builder_impl * const impl = release->impl; - - skc_path_builder_release_dispose(release,impl); -} - -static -void -// skc_path_builder_complete(struct skc_release_record * const release) -skc_path_builder_complete(skc_grid_t grid) -{ - // - // notify deps that this grid is complete enough for other grids to - // proceed - // - // the path builder still has some cleanup to do before all its - // resources can be reused - // - skc_grid_complete(grid); -} - -static -void -skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid) -{ - SKC_CL_CB(status); - - struct skc_release_record * const release = skc_grid_get_data(grid); - - SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid); -} - -// -// -// - -static -void -skc_path_builder_grid_pfn_waiting(skc_grid_t const grid) -{ - struct skc_release_record * const release = skc_grid_get_data(grid); - struct skc_path_builder_impl * const impl = release->impl; - - // 1. flush incomplete subblocks of path elements - // 2. unmap subbuffer on cq.unmap - // 3. flush cq.unmap - // 4. launch kernel on cq.kernel but wait for unmap completion - // 5. flush cq.kernel - // 6. remap relevant subbuffers on cq.map but wait for kernel completion - // 7. 
flush cq.map - - // - // FIXME -- can be smarter about flushing if the wip paths are not - // in the same subbuf as curr.to - // - // THIS IS IMPORTANT TO FIX - // - - // flush incomplete subblocks - skc_path_builder_finalize_subblocks(impl->path_builder); - - // - // get range of subbufs that need to be unmapped - // - // note that impl->prev subbufs have already been unmapped - // - union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); - union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to); - skc_uint const is_partial = curr_to.block > 0; - skc_uint const unmap_to = curr_to.subbuf + is_partial; - - // - // unmap all subbufs in range [from,to) - // - skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to); - - // - // launch kernels - // - skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to); - skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to); - skc_uint const pb_cmds = pb_prev_span + pb_curr_span; - - // - // 1) allocate blocks from pool - // - - // - // FIXME -- pack integers into struct/vector - // - cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw))); - cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads))); - cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf))); - cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds))); - - skc_device_enqueue_kernel(impl->runtime->device, - SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, - impl->cq, - impl->kernels.alloc, - 1, - 0,NULL,NULL); - - // - // 2) copy blocks from unmapped device-accessible memory - // - - // - // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7 - // - cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw))); - - cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw))); - cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask))); - - cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads))); - cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf))); - - cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer))); - cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer))); - - cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer))); - cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling))); - - cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from))); - cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span))); - cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from))); - - cl_event complete; - - skc_device_enqueue_kernel(impl->runtime->device, - SKC_DEVICE_KERNEL_ID_PATHS_COPY, - impl->cq, - impl->kernels.copy, - pb_cmds, - 0,NULL,&complete); - - // set a callback on completion - cl(SetEventCallback(complete,CL_COMPLETE, - skc_path_builder_paths_copy_cb, - grid)); - - // immediately release - cl(ReleaseEvent(complete)); - - // - // remap as many subbuffers as possible after the kernel completes - // - // note that remaps are async and enqueued on the same command queue - // as the kernel launch - // - // we can't remap subbuffers that are in the possibly empty range - // - // cases: - // - // - curr.to == wip.to which means no blocks have been acquired - // - curr.to points to first block in (next) subbuf - // - otherwise, wip acquired 
blocks in the curr.to subbuf - // - // check for these first 2 cases! - // - union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from); - skc_uint const no_wip = impl->curr.to == impl->wip.to; - skc_uint map_to = curr_to.subbuf + (is_partial && no_wip); - - // remap all subbufs in range [from,to) - skc_path_builder_impl_map(impl,prev_from.subbuf,map_to); - - // flush command queue - cl(Flush(impl->cq)); - - // save rolling - impl->prev.rolling = impl->wip.rolling.next; - - // update prev and curr - if (no_wip) - { - // - // if there was no wip then round up to the next subbuf - // - skc_ringdex_wip_to_subbuf_inc(impl); - - // - // update prev/curr with with incremented wip - // - impl->prev.from = impl->prev.to = impl->wip.to; - impl->curr.from = impl->curr.to = impl->wip.to; - } - else - { - // - // update prev with wip partials - // - impl->prev.from = impl->curr.to; - impl->prev.to = impl->wip .to; - - // - // start curr on a new subbuf boundary - // - skc_ringdex_wip_to_subbuf_inc(impl); - - impl->curr.from = impl->wip.to; - impl->curr.to = impl->wip.to; - } -} - -// -// -// - -static -void -skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl, - skc_uint const subbuf) -{ - // - // FIXME -- move to a power-of-two subbuf size and kickstart path - // copies as early as possible - // - // FIXME -- the subbufs "self-clock" (flow control) the kernel - // launches and accounting. Combine all the subbuffers and release - // records into a single indexable struct instead of 3. - // - struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf; - struct skc_release_record * const release = impl->release.records + subbuf; - struct skc_scheduler * const scheduler = impl->runtime->scheduler; - - // can't proceed until the paths have been released - SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to); - - // throw in a scheduler yield ... FIXME -- get rid of - skc_scheduler_yield(scheduler); - - // can't proceed until the subbuffer is mapped - cl(WaitForEvents(1,&sc->map)); -} - -// -// -// - -static -union skc_ringdex_expand -skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl) -{ - // break ringdex into components - union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); - - // does wip ringdex point to a new subbuffer? 
- if (to.block == 0) - { - // potentially spin/block waiting for subbuffer - skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf); - } - - // post increment wip.to - skc_ringdex_wip_to_block_inc(impl); - - return to; -} - -// -// -// - -static -skc_uint -skc_rolling_block(skc_uint const rolling, skc_uint const tag) -{ - return rolling | tag; -} - -static -skc_uint -skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag) -{ - return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag; -} - -static -void -skc_rolling_inc(struct skc_path_builder_impl * const impl) -{ - impl->wip.rolling.next += impl->wip.rolling.one; -} - -// -// -// - -static -void * -skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl, - skc_uint const rolling, - skc_cmd_paths_copy_tag const tag) -{ - // bump blocks count - impl->wip.head->header.blocks += 1; - - // acquire a block - union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl); - - // make a pointer - union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host; - - // store command for block - cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag); - -#if 0 - // store command for block - cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag); - - // increment rolling - skc_rolling_inc(impl); -#endif - - // return pointer to block - float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host; - - // FIXME -- make it easier to get config constant - return blocks_subbuf + (to.block * impl->runtime->config->block.words); -} - -// -// -// - -static -void -skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl) -{ - // store command to subbuf and get pointer to blocks subbuf - void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling, - SKC_CMD_PATHS_COPY_TAG_NODE); - - // copy head to blocks subbuf -- write-only - memcpy(block,impl->wip.node,impl->runtime->config->block.bytes); -} - -static -void -skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl) -{ - // store command to subbuf and get pointer to blocks subbuf - void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, - SKC_CMD_PATHS_COPY_TAG_HEAD); - - // copy head to blocks subbuf -- write-only - memcpy(block,impl->wip.head,impl->runtime->config->block.bytes); - - // increment rolling - skc_rolling_inc(impl); - - // the 'to' index is non-inclusive so assign wip.to after flush_head - impl->curr.to = impl->wip.to; -} - -// -// -// - -static -void -skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl) -{ - // update final block id in node - impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT); - - // if wip.ids is not the header then flush now full wip node - if (impl->wip.head->header.nodes > 0) - skc_path_builder_impl_flush_node(impl); - - // bump node count - impl->wip.head->header.nodes += 1; - - // save current rolling - impl->wip.ids.rolling = impl->wip.rolling.next; - - // increment rolling - skc_rolling_inc(impl); - - // update wip.ids.* - impl->wip.ids.next = impl->wip.node->tag_ids; - impl->wip.ids.rem = impl->runtime->config->block.words; -} - -static -void -skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl) -{ - impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure - impl->wip.subblocks.rolling = 
impl->wip.rolling.next; - impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, - SKC_CMD_PATHS_COPY_TAG_SEGS); - impl->wip.subblocks.idx = 0; - - // increment rolling - skc_rolling_inc(impl); -} - -// -// -// - -static -void -skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl, - skc_block_id_tag tag, - skc_uint vertices, - float * * subblocks) -{ - // - // FIRST TAG RECORDS THE ELEMENT TYPE - // - while (true) - { - // if only one block id left in node then acquire new node block - // and append its block id as with a next tag - if (impl->wip.ids.rem == 1) - skc_path_builder_impl_new_node_block(impl); - - // if zero subblocks left then acquire a new subblock block and - // append its block id - if (impl->wip.subblocks.rem == 0) - skc_path_builder_impl_new_segs_block(impl); - - // save first command -- tag and subblocks may have been updated - impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag); - - // increment node block subblock pointer - impl->wip.ids.next += 1; - impl->wip.ids.rem -= 1; - - // how many vertices can we store - skc_uint rem = min(vertices,impl->wip.subblocks.rem); - - // decrement vertices - vertices -= rem; - impl->wip.subblocks.rem -= rem; - impl->wip.subblocks.idx += rem; - - // assign subblocks - do { - *subblocks++ = impl->wip.subblocks.next; - impl->wip.subblocks.next += impl->runtime->config->subblock.words; - // FIXME -- move constants closer to structure - } while (--rem > 0); - - // anything left to do? - if (vertices == 0) - break; - - // any tag after this will be a caboose command - tag = SKC_BLOCK_ID_TAG_PATH_NEXT; - } -} - -// -// -// - -static -void -skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path) -{ - // finalize incomplete active subblocks -- we don't care about any - // remaining unused subblocks in block - skc_path_builder_finalize_subblocks(impl->path_builder); - - // mark remaining wips.ids in the head or node as invalid - skc_path_builder_impl_finalize_node(impl); - - // flush node if rem > 0 and node is not actually head - if (impl->wip.head->header.nodes >= 1) - skc_path_builder_impl_flush_node(impl); - - // acquire path host id - *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN - - // save path host handle - impl->wip.head->header.handle = *path; - - // flush head -- acquires a block and bumps head->header.blocks - skc_path_builder_impl_flush_head(impl); - - // get current release - struct skc_release_record * const release = skc_release_curr(impl); - - // acquire grid if null - if (release->grid == NULL) - { - release->grid = - SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - &release->grid, // NULL on start/force - release, // data payload - skc_path_builder_grid_pfn_waiting, - NULL, // no execute pfn - skc_path_builder_grid_pfn_dispose); - } - - // update grid map - skc_grid_map(release->grid,*path); - - // update path release - impl->release.paths[release->to] = *path; - - // increment release.to - release->to = (release->to + 1) % impl->ring.blocks_per.buffer; - - // add guard bit - *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH; - -#if 1 - // - // eager kernel launch? 
- // - { - union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from); - union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to); - - if (curr_from.subbuf != curr_to.subbuf) - { - skc_grid_start(release->grid); - // skc_scheduler_yield(impl->runtime->scheduler); - } - } -#endif -} - -// -// FIXME -- clean up accessing of CONFIG constants in these 3 routines -// - -static -void -skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl) -{ - // acquire subblock pointers - skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4, - impl->path_builder->line.coords); - - // increment line count - impl->wip.head->header.prims += 1; - - // update rem_count_xxx count - impl->path_builder->line.rem = impl->runtime->config->subblock.words; -} - -static -void -skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl) -{ - // acquire subblock pointers - skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6, - impl->path_builder->quad.coords); - - // increment line count - impl->wip.head->header.prims += 1; - - // update rem_count_xxx count - impl->path_builder->quad.rem = impl->runtime->config->subblock.words; -} - -static -void -skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl) -{ - // acquire subblock pointers - skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8, - impl->path_builder->cubic.coords); - - // increment line count - impl->wip.head->header.prims += 1; - - // update rem_count_xxx count - impl->path_builder->cubic.rem = impl->runtime->config->subblock.words; -} - -// -// -// - -static -void -skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl) -{ - // decrement reference count - if (--impl->path_builder->refcount != 0) - return; - - // - // otherwise, dispose of everything - // - struct skc_runtime * const runtime = impl->runtime; - - // free path builder - skc_runtime_host_perm_free(impl->runtime,impl->path_builder); - - // release cq - skc_runtime_release_cq_in_order(runtime,impl->cq); - - // release kernels - cl(ReleaseKernel(impl->kernels.alloc)); - cl(ReleaseKernel(impl->kernels.copy)); - - // free blocks extents - cl(ReleaseMemObject(impl->blocks.buffer)); - skc_runtime_host_perm_free(runtime,impl->blocks.subbufs); - - cl(ReleaseMemObject(impl->cmds.buffer)); - skc_runtime_host_perm_free(runtime,impl->cmds.subbufs); - - // free records - skc_runtime_host_perm_free(runtime,impl->release.records); - skc_runtime_host_perm_free(runtime,impl->release.paths); - - // release staging head and node - skc_runtime_host_perm_free(runtime,impl->wip.head); - skc_runtime_host_perm_free(runtime,impl->wip.node); - - // release reads scratch array - cl(ReleaseMemObject(impl->reads)); - - // for all subbuffers - // unmap subbuffer - // release subbuffer - // printf("%s not releasing subbuffers\n",__func__); - - skc_runtime_host_perm_free(impl->runtime,impl); -} - -// -// -// - -skc_err -skc_path_builder_cl_12_create(struct skc_context * const context, - struct skc_path_builder * * const path_builder) -{ - // - // retain the context - // skc_context_retain(context); - // - struct skc_runtime * const runtime = context->runtime; - - // allocate path builder - (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder)); - - // init state - SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY); - - (*path_builder)->context = context; - - // save opaque impl-specific 
pointers - (*path_builder)->begin = skc_path_builder_pfn_begin; - (*path_builder)->end = skc_path_builder_pfn_end; - (*path_builder)->new_line = skc_path_builder_pfn_new_line; - (*path_builder)->new_quad = skc_path_builder_pfn_new_quad; - (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic; - (*path_builder)->release = skc_path_builder_pfn_release; - - // initialize path builder counts - (*path_builder)->line.rem = 0; - (*path_builder)->quad.rem = 0; - (*path_builder)->cubic.rem = 0; - - (*path_builder)->refcount = 1; - - struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - (*path_builder)->impl = impl; - - // - // init impl - // - impl->path_builder = *path_builder; - impl->runtime = runtime; - - impl->cq = skc_runtime_acquire_cq_in_order(runtime); - - impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC); - impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY); - - // - // FIXME -- let these config constants remain constant and in place - // - struct skc_config const * const config = runtime->config; - - impl->ring.subbufs = config->paths_copy.buffer.count; - impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count; - impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count; - // - // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - // - - cl_int cl_err; - - // allocate large device-side extent for path data - impl->blocks.buffer = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere - NULL,&cl_err); cl_ok(cl_err); - - // allocate small host-side array of pointers to mapped subbufs - impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, - impl->ring.subbufs * - sizeof(*impl->blocks.subbufs)); - - // allocate large device-side extent for path copy commands - impl->cmds.buffer = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - config->paths_copy.command.buffer, - NULL,&cl_err); cl_ok(cl_err); - - // allocate small host-side array of pointers to mapped subbufs - impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, - impl->ring.subbufs * - sizeof(*impl->cmds.subbufs)); - - // allocate small host-side array of intervals of path handles - impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, - impl->ring.subbufs * - sizeof(*impl->release.records)); - - // allocate large host-side array that is max # of path handles in flight - impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, - impl->ring.blocks_per.buffer * - sizeof(*impl->release.paths)); - - // small scratch used by kernels - impl->reads = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, - sizeof(skc_uint) * impl->ring.subbufs, - NULL,&cl_err); cl_ok(cl_err); - - // initialize release record with impl backpointer - for (skc_uint ii=0; iiring.subbufs; ii++) - { - struct skc_release_record * record = impl->release.records + ii; - - record->impl = impl; - record->grid = NULL; - record->from = record->to = ii * impl->ring.blocks_per.subbuf; - } - - // - // allocate and map subbuffers -- we always check the command - // subbuffer's map/unmap events before touching it or its associated - // block subbuffer. 
- // - struct skc_subbuffer_blocks * sb = impl->blocks.subbufs; - struct skc_subbuffer_cmds * sc = impl->cmds .subbufs; - - cl_buffer_region rb = { 0, config->paths_copy.block.subbuf }; - cl_buffer_region rc = { 0, config->paths_copy.command.subbuf }; - - // for each subbuffer - for (skc_uint ii=0; iipaths_copy.buffer.count; ii++) - { - sb->device = clCreateSubBuffer(impl->blocks.buffer, - CL_MEM_HOST_WRITE_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, - &rb, - &cl_err); cl_ok(cl_err); - - sb->host = clEnqueueMapBuffer(impl->cq, - sb->device, - CL_FALSE, - CL_MAP_WRITE_INVALIDATE_REGION, - 0,rb.size, - 0,NULL,NULL, - &cl_err); cl_ok(cl_err); - - sc->device = clCreateSubBuffer(impl->cmds.buffer, - CL_MEM_HOST_WRITE_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, - &rc, - &cl_err); cl_ok(cl_err); - - sc->host = clEnqueueMapBuffer(impl->cq, - sc->device, - CL_FALSE, - CL_MAP_WRITE_INVALIDATE_REGION, - 0,rc.size, - 0,NULL,&sc->map, - &cl_err); cl_ok(cl_err); - sb += 1; - sc += 1; - - rb.origin += rb.size; - rc.origin += rc.size; - } - - // - // initialize remaining members - // - impl->prev.from = 0; - impl->prev.to = 0; - impl->prev.rolling = 0; - - impl->curr.from = 0; - impl->curr.to = 0; - - impl->wip.to = 0; - - impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes); - impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes); - - impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks; - impl->wip.rolling.next = 0; - - // for now, completely initialize builder before returning - cl(Finish(impl->cq)); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/path_builder_cl_12.h b/src/compute/skc/path_builder_cl_12.h deleted file mode 100644 index 20bb13cbdf..0000000000 --- a/src/compute/skc/path_builder_cl_12.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef PATH_BUILDER_CL_12_ONCE -#define PATH_BUILDER_CL_12_ONCE - -// -// -// - -#include "block.h" - -// -// A tag type that fits into the block id tag bitfield -// - -typedef enum skc_cmd_paths_copy_tag { - - SKC_CMD_PATHS_COPY_TAG_SEGS, - SKC_CMD_PATHS_COPY_TAG_NODE, - SKC_CMD_PATHS_COPY_TAG_HEAD, - - SKC_CMD_PATHS_COPY_TAG_COUNT - -} skc_cmd_paths_copy_tag; - - -SKC_STATIC_ASSERT(SKC_CMD_PATHS_COPY_TAG_COUNT <= SKC_BLOCK_ID_TAG_COUNT); - -// -// -// - -#endif - -// -// -// - diff --git a/src/compute/skc/paths_copy.cl b/src/compute/skc/paths_copy.cl deleted file mode 100644 index 06cc393c75..0000000000 --- a/src/compute/skc/paths_copy.cl +++ /dev/null @@ -1,543 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "path_builder_cl_12.h" -#include "path.h" -#include "block_pool_cl.h" - -// -// -// - -#if 0 - -// -// SIMD AVX2 -// - -#define SKC_PATHS_COPY_WORDS_PER_ELEM 8 -#define SKC_PATHS_COPY_SUBGROUP_SIZE 1 -#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES - -typedef skc_uint8 skc_paths_copy_elem; -typedef skc_uint8 skc_pb_idx_v; - -#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() - -#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS) - -#endif - -// -// -// - -#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) -#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) -#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) -#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) - -// FIXME -- use SUBGROUP terminology everywhere -#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) - -// -// -// - -#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ - (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) - -#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ - (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) - -// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) - -// -// -// - -// -// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL -// - -#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) - -#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) - -// -// -// - -skc_uint -skc_sub_group_local_id() -{ -#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 - return get_sub_group_local_id(); -#else - return 0; -#endif -} - -// -// convert an atomic read counter offset to a block id -// - -skc_block_id_t -skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, - skc_uint const bp_idx_mask, - skc_uint const bp_reads, - skc_uint const bp_off) -{ - skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; - - return bp_ids[bp_idx]; -} - -// -// -// - -void -skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to - skc_uint const bp_elems_idx, - __global skc_paths_copy_elem const * const pb_elems, // from - skc_uint const pb_elems_idx) -{ - for (skc_uint ii=0; ii\n",ii,bp_idx,b,elem C); - - SKC_PATHS_COPY_ELEM_EXPAND(); - - // store the elem back - (bp_elems+bp_elems_idx)[ii] = elem; - } -} - -// -// -// - -void -skc_host_map_update(__global skc_uint * const host_map, - skc_uint const block, - skc_paths_copy_elem const elem) -{ - // - // write first elem to map -- FIXME -- this is a little nasty - // because it relies on the the host handle always being the first - // word in the path header. - // - // OTOH, this is not unreasonable. The alternative is to have a - // separate kernel initializing the map. 
- // -#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 - if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) -#endif - { -#if SKC_PATHS_COPY_ELEM_WORDS == 1 - host_map[elem] = block; -#if 0 - printf("[%u] = %u\n",elem,block); -#endif -#else - host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; -#endif - } -} - -// -// -// - -void -skc_copy_head(__global skc_uint * const host_map, - skc_uint const block, - __global skc_paths_copy_elem * const bp_elems, // to - skc_uint const bp_elems_idx, - __global skc_block_id_t const * const bp_ids, - skc_uint const bp_reads, - skc_uint const bp_idx_mask, - __global skc_paths_copy_elem const * const pb_elems, // from - skc_uint const pb_elems_idx, - skc_uint const pb_rolling) -{ - // - // if there are more path header words than there are - // threads-per-block then we can just copy the initial header words - // -#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) - for (skc_uint ii=0; ii= pb_size) - pb_idx -= pb_size; - - // broadcast load the command - union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; - - // what do we want pb_elems do with this block? - skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); - - // compute offset from rolling base to get index into block pool ring allocation - skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); - - // convert the pb_cmd's offset counter pb_elems a block id - skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); - printf("< %8u >\n",block); - } -#endif - - // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() - skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; - - // calculate bp_elems (to) / pb_elems (from) - skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; - skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; - - if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) - { -#if 0 - if (tid == 0) - printf("%3u, segs\n",bp_off); -#endif - skc_copy_segs(bp_elems, - bp_elems_idx, - pb_elems, - pb_elems_idx); - } - else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) - { -#if 0 - if (tid == 0) - printf("%3u, NODE\n",bp_off); -#endif - skc_copy_node(bp_elems, // to - bp_elems_idx, - bp_ids, - bp_reads, - bp_idx_mask, - pb_elems, // from - pb_elems_idx, - pb_rolling); - } - else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) - { -#if 0 - if (tid == 0) - printf("%3u, HEAD\n",bp_off); -#endif - skc_copy_head(host_map, - block, - bp_elems, // to - bp_elems_idx, - bp_ids, - bp_reads, - bp_idx_mask, - pb_elems, // from - pb_elems_idx, - pb_rolling); - } -} - -// -// -// - -__kernel -SKC_PATHS_ALLOC_KERNEL_ATTRIBS -void -skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, - __global skc_uint * const bp_alloc, - skc_uint const bp_alloc_idx, - skc_uint const pb_cmd_count) -{ - // - // allocate blocks in block pool - // - skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); - - // store in slot - bp_alloc[bp_alloc_idx] = reads; - -#if 0 - printf("pc: %8u + %u\n",reads,pb_cmd_count); -#endif -} - -// -// -// diff --git a/src/compute/skc/paths_reclaim.cl b/src/compute/skc/paths_reclaim.cl deleted file mode 100644 index 563160613c..0000000000 --- a/src/compute/skc/paths_reclaim.cl +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright 2017 Google Inc. 
- * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// FIXME -- a pre-allocation step could load the path header quads and -// total up the number of blocks in the workgroup or subgroup -// minimizing the number of later atomics adds. -// - -#include "device_cl_12_gen9.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "block.h" -#include "path.h" -#include "common.h" - -// -// -// - -#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS) - -#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS) - -// -// -// - -#if ( SKC_PATHS_RECLAIM_X == 1 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_PATHS_RECLAIM_X == 2 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_PATHS_RECLAIM_X == 4 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_PATHS_RECLAIM_X == 8 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_PATHS_RECLAIM_X == 16) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_PATHS_RECLAIM_X" -#endif - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - SKC_BROADCAST_LAST_HELPER(E,I) - -// -// COMPILE-TIME PREDICATES -// - -#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \ - (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \ - SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I) - -#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \ - SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I) - -// -// RUN-TIME PREDICATES -// - -#define SKC_PATHS_RECLAIM_IS_HEADER(I) \ - (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS) - -// -// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL -// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK -// COMBOS (NOT NECESSARILY POW2) -// -// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR -// UINT TYPE INSTEAD OF A ULONG. -// - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 -#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint - -// -// -// - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ - (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ - ? 
0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ - S = sub_group_scan_exclusive_add(C) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \ - (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK) - -// -// -// - -struct skc_reclaim -{ - skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; -}; - -__kernel -SKC_PATHS_RECLAIM_KERNEL_ATTRIBS -void -skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring - __global skc_uint * const bp_elems, // block pool blocks - __global skc_uint volatile * const bp_atomics, // read/write atomics - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const map, // path host-to-device map - struct skc_reclaim const reclaim) // array of host path ids -{ -#if (__OPENCL_VERSION__ < 200) - skc_uint const reclaim_stride = get_num_sub_groups(); -#else - skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); - -#if 0 - // - // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT - // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL - // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE - // RECLAMATION JOB ON THE REST OF THE PIPELINE. - // - for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) -#endif - { - // get host path id - skc_path_h const path = reclaim.aN[reclaim_idx]; - - // get the path header block from the map - skc_block_id_t id = map[path]; - - // - // blindly load all of the head elements into registers - // - skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - skc_uint count_blocks, count_nodes; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ - count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ - } \ - if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ - count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes); - } -#endif - - // - // acquire a span in the block pool ids ring for reclaimed ids - // - // FIXME count_blocks and atomic add can be done in same lane - // - skc_uint bp_ids_base = 0; - - if (get_sub_group_local_id() == 0) { - bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); - -#if 0 - printf("paths: bp_ids_base = %u\n",bp_ids_base); -#endif - } - - bp_ids_base = sub_group_broadcast(bp_ids_base,0); - - // - // shift away the tagged block id's tag - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); - - 
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; - } - - // - // - we'll skip subgroups that are entirely header - // - // - but we need to mark any header elements that partially fill - // a subgroup as invalid tagged block ids - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \ - if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \ - h##I = SKC_TAGGED_BLOCK_ID_INVALID; \ - } \ - } \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - { - // - // count reclaimable blocks in each lane - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = h##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // printf("P %7u ! %u\n",bp_ids_idx,h##I); - } - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, walk the nodes - // - do { - // id of next block is in last lane - id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); - - // get index of each element - skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // - // blindly load all of the node elements into registers - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // shift away the tagged block id's tag - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; - } - - // - // count reclaimable blocks in each lane - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint 
const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = n##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // printf("P %7u ! %u\n",bp_ids_idx,n##I); - - // any more nodes? - } while (--count_nodes > 0); - } -} - -// -// -// diff --git a/src/compute/skc/place.cl b/src/compute/skc/place.cl deleted file mode 100644 index 00f16f7843..0000000000 --- a/src/compute/skc/place.cl +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "common.h" -#include "atomic_cl.h" -#include "raster.h" -#include "tile.h" - -// -// -// - -#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1) -#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1) - -// -// -// - -#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK - -// -// -// - -#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE) - -// -// -// - -#if ( SKC_PLACE_X == 1 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_1() -#define SKC_PLACE_EXPAND_I_LAST 0 - -#elif ( SKC_PLACE_X == 2 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_2() -#define SKC_PLACE_EXPAND_I_LAST 1 - -#elif ( SKC_PLACE_X == 4 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_4() -#define SKC_PLACE_EXPAND_I_LAST 3 - -#elif ( SKC_PLACE_X == 8 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_8() -#define SKC_PLACE_EXPAND_I_LAST 7 - -#elif ( SKC_PLACE_X == 16) -#define SKC_PLACE_EXPAND() SKC_EXPAND_16() -#define SKC_PLACE_EXPAND_I_LAST 15 -#endif - -// -// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE -// COALESCED WRITES. LO FIRST, FOLLOWED BY HI. -// -// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE -// KERNELS USE DIFFERENT SUBGROUP SIZES. -// -// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE -// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID. -// -// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER -// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY -// ONLY SUPPORT A SUBGROUP SIZE OF 16. 
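// A hedged sketch of reading one key back in the simplest case above, where
// SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE -- 'row' and
// 'block_base' are illustrative names, not part of this file:
//
#if 0
  skc_uint  const base = block_base
                       + row * 2 * SKC_PLACE_SUBGROUP_SIZE         // lo words of this row
                       + get_sub_group_local_id();
  skc_uint  const lo   = bp_elems[base];                            // first coalesced write
  skc_uint  const hi   = bp_elems[base + SKC_PLACE_SUBGROUP_SIZE];  // second coalesced write
  skc_ulong const key  = ((skc_ulong)hi << 32) | lo;                // reassembled 64-bit TTSK/TTPK key
#endif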
-// - -#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE ) - -#define SKC_PLACE_STRIDE_H(L) (L) -#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1) -#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK)) - -#define SKC_PLACE_STRIDE_H(L) (L) -#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) -#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask - -#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK)) -#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO) - -#endif - -// -// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE -// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8) -// - -#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS) - -#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS) - -#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS) - -#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) - - -// -// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX -// -#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) -#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k)) - -// -// TTSK v2: -// -// 0 63 -// | TTSB ID | PREFIX | SPAN | X | Y | -// +---------+--------+---------+-----+-----+ -// | 27 | 1 (=0) | 12 (=0) | 12 | 12 | -// -// -// TTPK v2: -// -// 0 63 -// | TTPB ID | PREFIX | SPAN | X | Y | -// +---------+--------+------+-----+-----+ -// | 27 | 1 (=1) | 12 | 12 | 12 | -// -// - -// -// TTCK (32-BIT COMPARE) v1: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 18 | 7 | 7 | -// -// -// TTCK (32-BIT COMPARE) v2: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 15 | 9 | 8 | -// -// -// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 27 | 1 | 1 | 18 | 9 | 8 | -// - -union skc_subgroup_smem -{ - skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE - - 
struct { - struct { - skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; - skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; - } lo; - - struct { - skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; - skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; - } hi; - - // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK]; - }; - -}; - -// -// scatter scan max -// -static -skc_int_v_t -skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem, - skc_int_v_t const iss, - skc_int_v_t const ess) -{ - // - // prefix sums determine which lanes we're going to work on next - // - skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE); - skc_int_v_t const scratch_idx = max(ess,0); - - // - // SIMT - // - - // - // zero the volatile smem scratchpad using vector syntax - // - smem->scratch[get_sub_group_local_id()] = ( 0 ); - - // - // store source lane at starting lane - // - if (is_scratch_store) { - smem->scratch[scratch_idx] = get_sub_group_local_id(); - } - - // - // propagate lanes to right using max scan - // - skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()]; - skc_int_v_t const source = sub_group_scan_inclusive_max(scratch); - - return source; -} - -// -// -// - -static -skc_bool -skc_xk_clip(union skc_tile_clip const * const tile_clip, - skc_ttxk_t * const xk) -{ - // - // clip the sk and pk keys - // - // if fully clipped then return false - // - // alternatively -- we can expand all these keys in place - // - // alternatively -- keep sk and pk keys segregated because sk - // represents the vast majority of keys and are easier to process. - // don't mess with the fastpath! - // - return false; -} - -// -// -// - -static -skc_ttck_t -skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const sk_idx) -{ - skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0 - skc_uint const hi = smem->hi.sk[sk_idx]; - - skc_ttck_t ck; - - ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id - - // FIXME -- x and y should already be clipped and shifted - skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; - skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; - - ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; - - return ck; -} - -static -skc_ttck_t -skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const pk_idx, - skc_uint const dx) -{ - skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1 - skc_uint const hi = smem->hi.pk[pk_idx]; - - skc_ttck_t ck; - - ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id - - // FIXME -- x and y should already be clipped and shifted - skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; - skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; - - ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; - - return ck; -} - -// -// -// - -static -void -skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const sk) -{ - // - // Pretty sure you can never ever have an sk 
count equal to 0 - // - skc_uint ck_base = 0; - - // last lane performs the block pool allocation with an atomic increment - if (get_sub_group_local_id() == 0) { - ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk); - } - - // broadcast base to all lanes - ck_base = sub_group_broadcast(ck_base,0); - - // convert sk keys to ck keys - for (skc_uint ii=get_sub_group_local_id(); iilo.pk[idx]; - skc_uint const hi = smem->hi.pk[idx]; - - skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN; - skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN; - - return (span_lo | span_hi) + 1; -} - -// -// -// - -static -void -skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const pk) -{ - // bail out if pk queue is empty - if (pk == 0) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("%u\n",pk); -#endif - - // - // FIXME -- this nested loop iterates over the queue processing a - // subgroup of 64-bit keys at a time. This is probably not the most - // efficient approach so investigate how to store and iterate over a - // wider than subgroup (node-sized) queue of keys. - // - - // round up so we work with full subgroups - skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK; - skc_uint ii = 0; - - // nested loop that expands all ttpk keys -#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE) - for (; ii\n",xk_idx); -#endif - - return xk_idx; -#endif -} - -// -// -// -__kernel -SKC_PLACE_KERNEL_ATTRIBS -void -skc_kernel_place(__global skc_bp_elem_t * const bp_elems, - __global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __global union skc_cmd_place const * const cmds, - __global skc_block_id_t * const map, - skc_uint4 const clip, - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) - __local union skc_subgroup_smem volatile smem[1]; -#else - __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS]; - __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // This is a subgroup-centric kernel - // - // Which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // - // Test the raster's translated bounds against the composition's - // tile clip - // - // There are 3 cases: - // - // - the raster is completely clipped -> return - // - the raster is partially clipped -> all keys must clipped - // - the raster is not clipped -> no keys are tested - // - // - // There are at least 4 implementations of place and we want to - // special-case them as much as possible so that, at the least, the - // fastpath remains fast. 
- // - // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP - // - // - implement CLIPPED + NO TRANSLATION path - // - // - implement NO CLIP + TRANSLATION path - // - // - implement CLIPPED + TRANSLATION path - // - // - // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin - // 12:12:8 integer where: - // - // 12: ttsk - // 12: ttpk - // 8: /dev/null -- clipped or invalid key - // - // Three kinds of nodes in a raster's list: - // - // - the head node - // - an internal node - // - the final node - // - -#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const cmd_idx = get_group_id(0); -#else - skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // load command - union skc_cmd_place const cmd = cmds[cmd_idx]; - - // get the raster header from the raster host id -- scalar - skc_block_id_t id = map[cmd.raster_h]; - - // - // load all of the head block ttxk keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_raster_node_elem const h##I = { \ - .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \ - bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \ - }; - - SKC_PLACE_EXPAND(); - - // - // load raster header counts -- we only need the "nodes" and "keys" - // words but the keys we loaded are doublewords. - // - // FIXME -- this can be made portable with compile-time macro expansion - // - skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES - skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS - - // - // - // -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ - nodes,keys, \ - I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ - h##I.u32v2.hi,h##I.u32v2.lo, \ - h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); - - SKC_PLACE_EXPAND(); -#endif - - // -#if 0 - if (get_sub_group_local_id() == 0) { - printf("place: %u / %u / %u\n",head_id,nodes,keys); - } -#endif - - { - // - // classify every key in the header - // - // keys: 0 is not a key / 1 is a key - // skpk: 0 is sk / 1 is pk - // - skc_uint bits_keys = 0; - skc_uint bits_skpk = 0; - - // - // calculate bits_keys - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \ - if (idx < keys) { \ - bits_keys |= (1u << I); \ - } \ - if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ - if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \ - if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ - bits_keys &= ~(1u << I); \ - } \ - } \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // blindly calculate bits_skpk - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2X : %2X\n",bits_keys,bits_skpk); -#endif - - // - // next pointer is last element of last row. save it now because - // this might be recognized as a subgroup-uniform/scalar. 
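// (i.e. the next-node id sits in the last element of the last register row,
// so the broadcast below reads lane SKC_PLACE_SUBGROUP_LAST of row
// SKC_PLACE_EXPAND_I_LAST and makes 'id' subgroup-uniform before the
// trailing-node loop dereferences it)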
- // - id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); - - // - // append SK keys first - // - skc_uint const bits_sk = bits_keys & ~bits_skpk; - skc_uint sk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint is_sk = (bits_sk >> I) & 1; \ - skc_uint sk_idx = skc_ballot(&sk,is_sk); \ - if (is_sk) { \ - smem->lo.sk[sk_idx] = h##I.xk.lo; \ - smem->hi.sk[sk_idx] = h##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // append PK keys next - // - skc_uint const bits_pk = bits_keys & bits_skpk; - skc_uint pk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint is_pk = (bits_pk >> I) & 1; \ - skc_uint pk_idx = skc_ballot(&pk,is_pk); \ - if (is_pk) { \ - smem->lo.pk[pk_idx] = h##I.xk.lo; \ - smem->hi.pk[pk_idx] = h##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2u * %2u\n",sk,pk); -#endif - // - // flush the keys - // - skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); - skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); - } - - // - // we're done if there was only a head node - // - if (nodes == 0) - return; - - // - // decrement keys - // - keys -= SKC_RASTER_HEAD_COUNT_KEYS; - - // - // otherwise, append keys in trailing nodes to smem - // - while (true) - { - // - // load all of the node block ttxk keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_raster_node_elem const n##I = { \ - .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \ - bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \ - }; - - SKC_PLACE_EXPAND(); - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ - nodes,keys, \ - I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ - n##I.u32v2.hi,n##I.u32v2.lo, \ - n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); - - SKC_PLACE_EXPAND(); -#endif - - // - // classify every key in the header - // - // keys: 0 is not a key / 1 is a key - // skpk: 0 is sk / 1 is pk - // - skc_uint bits_keys = 0; - skc_uint bits_skpk = 0; - - // - // calculate bits_keys - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \ - if (idx < keys) { \ - bits_keys |= (1u << I); \ - } \ - if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ - if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \ - if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ - bits_keys &= ~(1u << I); \ - } \ - } \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // blindly calculate bits_skpk - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2X : %2X\n",bits_keys,bits_skpk); -#endif - - // - // next pointer is last element of last row. save it now because - // this might be recognized as a subgroup-uniform/scalar. 
- // - id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); - - // - // append SK keys first - // - skc_uint const bits_sk = bits_keys & ~bits_skpk; - skc_uint sk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint is_sk = (bits_sk >> I) & 1; \ - skc_uint sk_idx = skc_ballot(&sk,is_sk); \ - if (is_sk) { \ - smem->lo.sk[sk_idx] = n##I.xk.lo; \ - smem->hi.sk[sk_idx] = n##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // append PK keys next - // - skc_uint const bits_pk = bits_keys & bits_skpk; - skc_uint pk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint is_pk = (bits_pk >> I) & 1; \ - skc_uint pk_idx = skc_ballot(&pk,is_pk); \ - if (is_pk) { \ - smem->lo.pk[pk_idx] = n##I.xk.lo; \ - smem->hi.pk[pk_idx] = n##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2u * %2u\n",sk,pk); -#endif - // - // if total for either the sk or pk queue reaches the - // highwater mark then flush it to the extent - // - skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); - skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); - - // - // if this was the last node then we're done - // - if (--nodes == 0) - return; - - // - // otherwise decrement keys - // - keys -= SKC_RASTER_NODE_COUNT_KEYS; - } -} - -// -// -// diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.c b/src/compute/skc/platforms/cl_12/allocator_device_cl.c new file mode 100644 index 0000000000..aa44f36e87 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.c @@ -0,0 +1,136 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "runtime_cl_12.h" +#include "config_cl.h" +#include "common/cl/assert_cl.h" + +// +// PERM +// + +cl_mem +skc_runtime_device_perm_alloc(struct skc_runtime * const runtime, + cl_mem_flags const flags, + size_t const size) +{ + cl_int cl_err; + + cl_mem mem = clCreateBuffer(runtime->cl.context, + flags, + size, + NULL, + &cl_err); cl_ok(cl_err); + return mem; +} + +void +skc_runtime_device_perm_free(struct skc_runtime * const runtime, + cl_mem const mem) +{ + cl(ReleaseMemObject(mem)); +} + +// +// TEMP +// + +cl_mem +skc_runtime_device_temp_alloc(struct skc_runtime * const runtime, + cl_mem_flags const flags, + size_t const size, + skc_subbuf_id_t * const subbuf_id, + size_t * const subbuf_size) +{ + if (size == 0) + { + *subbuf_id = (skc_subbuf_id_t)-1; + + if (subbuf_size != NULL) + *subbuf_size = 0; + + return NULL; + } + + cl_buffer_region br; + + br.origin = skc_suballocator_subbuf_alloc(&runtime->allocator.device.temp.suballocator, + runtime->scheduler, + size,subbuf_id,&br.size); + + if (subbuf_size != NULL) + *subbuf_size = br.size; + + cl_int cl_err; + + cl_mem mem = clCreateSubBuffer(runtime->allocator.device.temp.extent, + flags, + CL_BUFFER_CREATE_TYPE_REGION, + &br, + &cl_err); cl_ok(cl_err); + + return mem; +} + + +void +skc_runtime_device_temp_free(struct skc_runtime * const runtime, + cl_mem const mem, + skc_subbuf_id_t const subbuf_id) +{ + if (mem == NULL) + return; + + skc_suballocator_subbuf_free(&runtime->allocator.device.temp.suballocator,subbuf_id); + + cl(ReleaseMemObject(mem)); +} + +// +// +// + +void +skc_allocator_device_create(struct skc_runtime * const runtime) +{ + skc_suballocator_create(runtime, + &runtime->allocator.device.temp.suballocator, + "DEVICE", + runtime->config->suballocator.device.subbufs, + 
runtime->cl.base_align, + runtime->config->suballocator.device.size); + +#ifndef NDEBUG +#pragma message("Get rid of CL_MEM_ALLOC_HOST_PTR as soon as the sorter is installed") + cl_mem_flags const flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; +#else + cl_mem_flags const flags = CL_MEM_READ_WRITE; +#endif + + runtime->allocator.device.temp.extent = + skc_runtime_device_perm_alloc(runtime, + flags, + runtime->config->suballocator.device.size); +} + +void +skc_allocator_device_dispose(struct skc_runtime * const runtime) +{ + skc_suballocator_dispose(runtime,&runtime->allocator.device.temp.suballocator); + + skc_runtime_device_perm_free(runtime,runtime->allocator.device.temp.extent); +} + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.h b/src/compute/skc/platforms/cl_12/allocator_device_cl.h new file mode 100644 index 0000000000..67d4e41398 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.h @@ -0,0 +1,54 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +// +// +// + +#include "suballocator.h" + +// +// +// + +struct skc_allocator_device +{ +#if 0 + struct { + + } perm; +#endif + + struct { + struct skc_suballocator suballocator; + cl_mem extent; + } temp; +}; + +// +// +// + +void +skc_allocator_device_create(struct skc_runtime * const runtime); + +void +skc_allocator_device_dispose(struct skc_runtime * const runtime); + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/atomic_cl.h b/src/compute/skc/platforms/cl_12/atomic_cl.h new file mode 100644 index 0000000000..c196c36390 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/atomic_cl.h @@ -0,0 +1,72 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#ifndef SKC_ONCE_ATOMIC_CL +#define SKC_ONCE_ATOMIC_CL + +// +// git cl upload is bleating about needing an #include before and #if +// so we're unneccesarily reloading the types and OpenCL header +// + +#include "types.h" + +#if (__OPENCL_C_VERSION__ <= 120 /*CL_VERSION_1_2*/) + +#define SKC_ATOMIC_UINT uint +#define SKC_ATOMIC_INT int + +#define SKC_ATOMIC_ADD_LOCAL_RELAXED_DEVICE(p,v) atomic_add(p,v) +#define SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(p,v) atomic_add(p,v) + +#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(p,v) atomic_add(p,v) +#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(p,v) atomic_add(p,v) + +#else // __OPENCL_C_VERSION__ > __CL_VERSION_1_2 + +// +// REMOVE THESE DEFINES ASAP -- ONLY HERE BECAUSE THE INTEL CODE +// BUILDER UTILITY DOESN'T SUPPORT CREATING AN ATOMIC TYPE BUFFER +// + +#ifdef SKC_SUPPORT_BROKEN_INTEL_CODE_BUILDER + +#define SKC_ATOMIC_UINT uint +#define SKC_ATOMIC_CAST_LOCAL(p) (__local atomic_uint volatile * restrict const)(p) +#define SKC_ATOMIC_CAST_GLOBAL(p) (__global atomic_uint volatile * restrict const)(p) + +#else + +#define SKC_ATOMIC_UINT atomic_uint +#define SKC_ATOMIC_CAST_LOCAL(p) (p) +#define SKC_ATOMIC_CAST_GLOBAL(p) (p) + +#endif + + +#define SKC_ATOMIC_ADD_LOCAL_RELAXED_DEVICE(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_LOCAL(p), \ + v,memory_order_relaxed,memory_scope_device) +#define SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_LOCAL(p), \ + v,memory_order_relaxed,memory_scope_sub_group) + +#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_GLOBAL(p), \ + v,memory_order_relaxed,memory_scope_device) +#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_GLOBAL(p), \ + v,memory_order_relaxed,memory_scope_sub_group) + +#endif + +// +// +// + +#endif // SKC_ONCE_ATOMIC_CL + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/block_pool_cl.h b/src/compute/skc/platforms/cl_12/block_pool_cl.h new file mode 100644 index 0000000000..c88370919e --- /dev/null +++ b/src/compute/skc/platforms/cl_12/block_pool_cl.h @@ -0,0 +1,60 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_ONCE_BLOCK_POOL +#define SKC_ONCE_BLOCK_POOL + +// +// +// + +#include "types.h" + +// +// +// + +union skc_block_pool_size +{ + skc_uint3 u32v3; + + struct { + skc_uint pool_size; // number of blocks + skc_uint ring_pow2; // rounded-up pow2 of pool_size + skc_uint ring_mask; // ring_pow2 - 1 + }; +}; + +// +// +// + +union skc_block_pool_atomic +{ + skc_uint2 u32v2; + + skc_uint u32a2[2]; + + struct { + skc_uint reads; + skc_uint writes; + }; +}; + +#define SKC_BP_ATOMIC_OFFSET_READS 0 +#define SKC_BP_ATOMIC_OFFSET_WRITES 1 + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/block_pool_cl_12.h b/src/compute/skc/platforms/cl_12/block_pool_cl_12.h new file mode 100644 index 0000000000..6fa8a39ca0 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/block_pool_cl_12.h @@ -0,0 +1,33 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#pragma once + +// +// +// + +#include "block_pool_cl.h" +#include "extent_cl_12.h" + +// +// device side block pool +// + +struct skc_block_pool +{ + union skc_block_pool_size const * size; + + struct skc_extent_pdrw blocks; + struct skc_extent_pdrw ids; + struct skc_extent_phr_pdrw atomics; +}; + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/composition_cl_12.c b/src/compute/skc/platforms/cl_12/composition_cl_12.c new file mode 100644 index 0000000000..7853564636 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/composition_cl_12.c @@ -0,0 +1,823 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include + +#include "hs/cl/hs_cl_launcher.h" + +#include "common/cl/assert_cl.h" + +#include "composition_cl_12.h" +#include "config_cl.h" + +#include "context.h" +#include "raster.h" +#include "handle.h" + +#include "runtime_cl_12.h" + +#include "common.h" +#include "tile.h" + +// +// TTCK (32-BIT COMPARE) v1: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 18 | 7 | 7 | +// +// +// TTCK (32-BIT COMPARE) v2: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 15 | 9 | 8 | +// +// +// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 27 | 1 | 1 | 18 | 9 | 8 | +// + +union skc_ttck +{ + skc_ulong u64; + skc_uint2 u32v2; + + struct { + skc_uint id : SKC_TTCK_LO_BITS_ID; + skc_uint prefix : SKC_TTCK_LO_BITS_PREFIX; + skc_uint escape : SKC_TTCK_LO_BITS_ESCAPE; + skc_uint layer_lo : SKC_TTCK_LO_BITS_LAYER; + skc_uint layer_hi : SKC_TTCK_HI_BITS_LAYER; + skc_uint x : SKC_TTCK_HI_BITS_X; + skc_uint y : SKC_TTCK_HI_BITS_Y; + }; + + struct { + skc_ulong na0 : SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE; + skc_ulong layer : SKC_TTCK_BITS_LAYER; + skc_ulong na1 : SKC_TTCK_HI_BITS_YX; + }; + + struct { + skc_uint na2; + skc_uint na3 : SKC_TTCK_HI_BITS_LAYER; + skc_uint yx : SKC_TTCK_HI_BITS_YX; + }; +}; + +// +// FIXME -- accept floats on host but convert to subpixel offsets +// before appending to command ring +// + +#define SKC_PLACE_CMD_TX_CONVERT(f) 0 +#define SKC_PLACE_CMD_TY_CONVERT(f) 0 + +// +// COMPOSITION PLACE +// +// This is a snapshot of the host-side command queue. +// +// Note that the composition command extent could be implemented as +// either a mapped buffer or simply copied to an ephemeral extent. +// +// This implementation may vary between compute platforms. 
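// For reference, a hedged sketch of pulling fields out of a TTCK with the
// skc_ttck union above -- 'key' is a hypothetical input value and the field
// widths are whatever the SKC_TTCK_* macros select:
//
#if 0
  union skc_ttck const ck = { .u64 = key };

  skc_uint  const id    = ck.id;     // payload/TTSB/TTPB id              (SKC_TTCK_LO_BITS_ID)
  skc_ulong const layer = ck.layer;  // layer, straddles 32-bit boundary  (SKC_TTCK_BITS_LAYER)
  skc_uint  const x     = ck.x;      // tile x                            (SKC_TTCK_HI_BITS_X)
  skc_uint  const y     = ck.y;      // tile y                            (SKC_TTCK_HI_BITS_Y)
#endif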
+// + +struct skc_composition_place +{ + struct skc_composition_impl * impl; + + cl_command_queue cq; + + struct skc_extent_phw1g_tdrNs_snap cmds; + + skc_subbuf_id_t id; +}; + +// +// Forward declarations +// + +static +void +skc_composition_unseal_block(struct skc_composition_impl * const impl, + skc_bool const block); + +// +// +// + +static +void +skc_composition_pfn_release(struct skc_composition_impl * const impl) +{ + if (--impl->composition->ref_count != 0) + return; + + // + // otherwise, dispose of all resources + // + + // the unsealed state is a safe state to dispose of resources + skc_composition_unseal_block(impl,true); // block + + struct skc_runtime * const runtime = impl->runtime; + + // free host composition + skc_runtime_host_perm_free(runtime,impl->composition); + + // release the cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // release kernels + cl(ReleaseKernel(impl->kernels.place)); + cl(ReleaseKernel(impl->kernels.segment)); + + // release extents + skc_extent_phw1g_tdrNs_free(runtime,&impl->cmds.extent); + skc_extent_phrw_free (runtime,&impl->saved.extent); + skc_extent_phr_pdrw_free (runtime,&impl->atomics); + + skc_extent_pdrw_free (runtime,&impl->keys); + skc_extent_pdrw_free (runtime,&impl->offsets); + + // free composition impl + skc_runtime_host_perm_free(runtime,impl); +} + +// +// +// + +static +void +skc_composition_place_grid_pfn_dispose(skc_grid_t const grid) +{ + struct skc_composition_place * const place = skc_grid_get_data(grid); + struct skc_composition_impl * const impl = place->impl; + struct skc_runtime * const runtime = impl->runtime; + + // release cq + skc_runtime_release_cq_in_order(runtime,place->cq); + + // unmap the snapshot (could be a copy) + skc_extent_phw1g_tdrNs_snap_free(runtime,&place->cmds); + + // release place struct + skc_runtime_host_temp_free(runtime,place,place->id); + + // release impl + skc_composition_pfn_release(impl); +} + +// +// +// + +static +void +skc_composition_place_read_complete(skc_grid_t const grid) +{ + skc_grid_complete(grid); +} + +static +void +skc_composition_place_read_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_composition_place * const place = skc_grid_get_data(grid); + struct skc_composition_impl * const impl = place->impl; + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(scheduler,skc_composition_place_read_complete,grid); +} + +static +void +skc_composition_place_grid_pfn_execute(skc_grid_t const grid) +{ + // + // FILLS EXPAND + // + // need result of cmd counts before launching RASTERIZE grids + // + // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host + // - OpenCL 2.x: have a kernel size and launch RASTERIZE grids from device + // - or launch a device-wide grid that feeds itself but that's unsatisfying + // + struct skc_composition_place * const place = skc_grid_get_data(grid); + struct skc_composition_impl * const impl = place->impl; + struct skc_runtime * const runtime = impl->runtime; + + skc_uint const work_size = skc_extent_ring_snap_count(place->cmds.snap); + skc_uint4 const clip = { 0, 0, SKC_UINT_MAX, SKC_UINT_MAX }; + + // initialize kernel args + cl(SetKernelArg(impl->kernels.place,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.place,1,SKC_CL_ARG(impl->atomics.drw))); + 
cl(SetKernelArg(impl->kernels.place,2,SKC_CL_ARG(impl->keys.drw))); + cl(SetKernelArg(impl->kernels.place,3,SKC_CL_ARG(place->cmds.drN))); + cl(SetKernelArg(impl->kernels.place,4,SKC_CL_ARG(runtime->handle_pool.map.drw))); + cl(SetKernelArg(impl->kernels.place,5,SKC_CL_ARG(clip))); // FIXME -- convert the clip to yx0/yx1 format + cl(SetKernelArg(impl->kernels.place,6,SKC_CL_ARG(work_size))); + + // launch kernel + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_PLACE, + place->cq, + impl->kernels.place, + work_size, + 0,NULL,NULL); + // + // copy atomics back after every place launch + // + cl_event complete; + + skc_extent_phr_pdrw_read(&impl->atomics,place->cq,&complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_composition_place_read_cb,grid)); + cl(ReleaseEvent(complete)); + + // flush command queue + cl(Flush(place->cq)); +} + +// +// +// + +static +void +skc_composition_snap(struct skc_composition_impl * const impl) +{ + skc_composition_retain(impl->composition); + + skc_subbuf_id_t id; + + struct skc_composition_place * const place = skc_runtime_host_temp_alloc(impl->runtime, + SKC_MEM_FLAGS_READ_WRITE, + sizeof(*place),&id,NULL); + + // save the subbuf id + place->id = id; + + // save backpointer + place->impl = impl; + + // set grid data + skc_grid_set_data(impl->grids.place,place); + + // acquire command queue + place->cq = skc_runtime_acquire_cq_in_order(impl->runtime); + + // checkpoint the ring + skc_extent_ring_checkpoint(&impl->cmds.ring); + + // make a snapshot + skc_extent_phw1g_tdrNs_snap_init(impl->runtime,&impl->cmds.ring,&place->cmds); + + // unmap the snapshot (could be a copy) + skc_extent_phw1g_tdrNs_snap_alloc(impl->runtime, + &impl->cmds.extent, + &place->cmds, + place->cq, + NULL); + + skc_grid_force(impl->grids.place); +} + +// +// +// + +static +void +skc_composition_pfn_seal(struct skc_composition_impl * const impl) +{ + // return if sealing or sealed + if (impl->state >= SKC_COMPOSITION_STATE_SEALING) + return; + + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // + // otherwise, wait for UNSEALING > UNSEALED transition + // + if (impl->state == SKC_COMPOSITION_STATE_UNSEALING) + { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_UNSEALED); + } + else // or we were already unsealed + { + // flush is there is work in progress + skc_uint const count = skc_extent_ring_wip_count(&impl->cmds.ring); + + if (count > 0) { + skc_composition_snap(impl); + } + } + + // + // now unsealed so we need to start sealing... 
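// For reference, the state transitions driven by seal()/unseal() in this
// file (UNSEALING is declared but never entered by this implementation):
//
//   UNSEALED --- seal() ------------------> SEALING
//   SEALING  --- sort grid completes -----> SEALED
//   SEALED   --- unseal() ----------------> UNSEALED  (reset optionally zeroes the
//                                                      atomics and releases saved rasters)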
+ // + impl->state = SKC_COMPOSITION_STATE_SEALING; + + // + // the seal operation implies we should force start all dependencies + // that are still in a ready state + // + skc_grid_force(impl->grids.sort); +} + +// +// +// + +void +skc_composition_sort_execute_complete(struct skc_composition_impl * const impl) +{ + // we're sealed + impl->state = SKC_COMPOSITION_STATE_SEALED; + + // this grid is done + skc_grid_complete(impl->grids.sort); +} + +static +void +skc_composition_sort_execute_cb(cl_event event, cl_int status, struct skc_composition_impl * const impl) +{ + SKC_CL_CB(status); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(impl->runtime->scheduler,skc_composition_sort_execute_complete,impl); +} + +static +void +skc_composition_sort_grid_pfn_execute(skc_grid_t const grid) +{ + struct skc_composition_impl * const impl = skc_grid_get_data(grid); + + // we should be sealing + assert(impl->state == SKC_COMPOSITION_STATE_SEALING); + + struct skc_place_atomics * const atomics = impl->atomics.hr; + +#ifndef NDEBUG + fprintf(stderr,"composition sort: %u\n",atomics->keys); +#endif + + if (atomics->keys > 0) + { + uint32_t keys_padded_in, keys_padded_out; + + hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); + + hs_sort(impl->cq, + impl->keys.drw, + impl->keys.drw, + atomics->keys, + keys_padded_in, + keys_padded_out, + false); + + cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(impl->keys.drw))); + cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(impl->offsets.drw))); + cl(SetKernelArg(impl->kernels.segment,2,SKC_CL_ARG(impl->atomics.drw))); + + // find start of each tile + skc_device_enqueue_kernel(impl->runtime->device, + SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, + impl->cq, + impl->kernels.segment, + atomics->keys, + 0,NULL,NULL); + } + + cl_event complete; + + // next stage needs to know number of key segments + skc_extent_phr_pdrw_read(&impl->atomics,impl->cq,&complete); + + // register a callback + cl(SetEventCallback(complete,CL_COMPLETE,skc_composition_sort_execute_cb,impl)); + cl(ReleaseEvent(complete)); + + // flush cq + cl(Flush(impl->cq)); +} + +// +// +// + +static +void +skc_composition_raster_release(struct skc_composition_impl * const impl) +{ + // + // reference counts to rasters can only be released when the + // composition is unsealed and the atomics are reset. + // + skc_runtime_raster_device_release(impl->runtime, + impl->saved.extent.hrw, + impl->saved.count); + // reset count + impl->saved.count = 0; +} + +// +// +// + +static +void +skc_composition_unseal_block(struct skc_composition_impl * const impl, + skc_bool const block) +{ + // return if already unsealed + if (impl->state == SKC_COMPOSITION_STATE_UNSEALED) + return; + + // + // otherwise, we're going to need to pump the scheduler + // + struct skc_scheduler * const scheduler = impl->runtime->scheduler; + + // + // wait for UNSEALING > UNSEALED transition + // + if (impl->state == SKC_COMPOSITION_STATE_UNSEALING) + { + if (block) { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_UNSEALED); + } + return; + } + + // + // wait for SEALING > SEALED transition ... 
+ // + if (impl->state == SKC_COMPOSITION_STATE_SEALING) + { + // wait if sealing + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_SEALED); + } + + // wait for rendering locks to be released + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0); + + // + // no need to visit UNSEALING state with this implementation + // + + // acquire a new grid + impl->grids.sort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + NULL, // the composition state guards this + impl, + NULL, // no waiting + skc_composition_sort_grid_pfn_execute, + NULL); // no dispose + + // mark composition as unsealed + impl->state = SKC_COMPOSITION_STATE_UNSEALED; +} + +// +// can only be called on a composition that was just unsealed +// +static +void +skc_composition_reset(struct skc_composition_impl * const impl) +{ + // zero the atomics + skc_extent_phr_pdrw_zero(&impl->atomics,impl->cq,NULL); + + // flush it + cl(Flush(impl->cq)); + + // release all the rasters + skc_composition_raster_release(impl); +} + +static +void +skc_composition_unseal_block_reset(struct skc_composition_impl * const impl, + skc_bool const block, + skc_bool const reset) +{ + skc_composition_unseal_block(impl,block); + + if (reset) { + skc_composition_reset(impl); + } +} + +// +// +// + +static +void +skc_composition_pfn_unseal(struct skc_composition_impl * const impl, skc_bool const reset) +{ + skc_composition_unseal_block_reset(impl,false,reset); +} + +// +// only needs to create a grid +// + +static +void +skc_composition_place_create(struct skc_composition_impl * const impl) +{ + // acquire a grid + impl->grids.place = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + &impl->grids.place, + NULL, + NULL, // no waiting + skc_composition_place_grid_pfn_execute, + skc_composition_place_grid_pfn_dispose); + + // assign happens-after relationship + skc_grid_happens_after_grid(impl->grids.sort,impl->grids.place); +} + + +static +skc_err +skc_composition_pfn_place(struct skc_composition_impl * const impl, + skc_raster_t const * rasters, + skc_layer_id const * layer_ids, + skc_float const * txs, + skc_float const * tys, + skc_uint count) +{ + // block and yield if not unsealed + skc_composition_unseal_block(impl,true); + + // + // validate and retain all rasters + // + skc_err err; + + err = skc_runtime_handle_device_validate_retain(impl->runtime, + SKC_TYPED_HANDLE_TYPE_IS_RASTER, + rasters, + count); + if (err) + return err; + + skc_runtime_handle_device_retain(impl->runtime,rasters,count); + + // + // save the stripped handles + // + skc_raster_t * saved = impl->saved.extent.hrw; + + saved += impl->saved.count; + impl->saved.count += count; + + for (skc_uint ii=0; iiruntime->scheduler,(rem = skc_extent_ring_wip_rem(&impl->cmds.ring)) == 0); + + // append commands + skc_uint avail = min(rem,count); + + // decrement count + count -= avail; + + // launch a place kernel after copying commands? + skc_bool const is_wip_full = (avail == rem); + + // if there is no place grid then create one + if (impl->grids.place == NULL) + { + skc_composition_place_create(impl); + } + + // + // FIXME -- OPTIMIZATION? -- the ring_wip_index_inc() test can + // be avoided by splitting into at most two intervals. It should + // be plenty fast as is though so leave for now. 
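// A hedged sketch of the two-interval split suggested above -- 'head' is an
// illustrative stand-in for the ring's current wip write offset (already
// wrapped into [0,elem_count)) and only the index arithmetic is shown:
//
#if 0
  skc_uint const first  = min(avail,elem_count - head);  // contiguous run up to the wrap
  skc_uint const second = avail - first;                 // wrapped remainder -- may be zero

  // fill cmds[head..head+first-1] and then cmds[0..second-1] without testing
  // for wrap-around on every index increment
#endif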
+ // + union skc_cmd_place * const cmds = impl->cmds.extent.hw1; + + if ((txs == NULL) && (tys == NULL)) + { + while (avail-- > 0) + { + skc_raster_t const raster = *saved++; + + skc_grid_happens_after_handle(impl->grids.place,raster); + + cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = + (union skc_cmd_place){ raster, *layer_ids++, 0, 0 }; + } + } + else if (txs == NULL) + { + while (avail-- > 0) + { + skc_raster_t const raster = *saved++; + + skc_grid_happens_after_handle(impl->grids.place,raster); + + cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = + (union skc_cmd_place){ raster, + *layer_ids++, + 0, + SKC_PLACE_CMD_TY_CONVERT(*tys++) }; + } + } + else if (tys == NULL) + { + while (avail-- > 0) + { + skc_raster_t const raster = *saved++; + + skc_grid_happens_after_handle(impl->grids.place,raster); + + cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = + (union skc_cmd_place){ raster, + *layer_ids++, + SKC_PLACE_CMD_TX_CONVERT(*txs++), + 0 }; + } + } + else + { + while (avail-- > 0) + { + skc_raster_t const raster = *saved++; + + skc_grid_happens_after_handle(impl->grids.place,raster); + + cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = + (union skc_cmd_place){ raster, + *layer_ids++, + SKC_PLACE_CMD_TX_CONVERT(*txs++), + SKC_PLACE_CMD_TY_CONVERT(*tys++) }; + } + } + + // launch place kernel? + if (is_wip_full) { + skc_composition_snap(impl); + } + } while (count > 0); + + return SKC_ERR_SUCCESS; +} + +// +// +// + +static +void +skc_composition_pfn_bounds(struct skc_composition_impl * const impl, skc_int bounds[4]) +{ + // + // FIXME -- not implemented yet + // + // impl bounds will be copied back after sealing + // + bounds[0] = SKC_INT_MIN; + bounds[1] = SKC_INT_MIN; + bounds[2] = SKC_INT_MAX; + bounds[3] = SKC_INT_MAX; +} + +// +// +// + +void +skc_composition_retain_and_lock(struct skc_composition * const composition) +{ + skc_composition_retain(composition); + + composition->impl->lock_count += 1; +} + +void +skc_composition_unlock_and_release(struct skc_composition * const composition) +{ + composition->impl->lock_count -= 1; + + skc_composition_pfn_release(composition->impl); +} + +// +// +// + +skc_err +skc_composition_cl_12_create(struct skc_context * const context, + struct skc_composition * * const composition) +{ + struct skc_runtime * const runtime = context->runtime; + + // retain the context + // skc_context_retain(context); + + // allocate impl + struct skc_composition_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + // allocate composition + (*composition) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**composition)); + + (*composition)->context = context; + (*composition)->impl = impl; + (*composition)->ref_count = 1; + + (*composition)->place = skc_composition_pfn_place; + (*composition)->unseal = skc_composition_pfn_unseal; + (*composition)->seal = skc_composition_pfn_seal; + (*composition)->bounds = skc_composition_pfn_bounds; + (*composition)->release = skc_composition_pfn_release; + + // intialize impl + impl->composition = (*composition); + impl->runtime = runtime; + + SKC_ASSERT_STATE_INIT(impl,SKC_COMPOSITION_STATE_SEALED); + + impl->lock_count = 0; + + impl->grids.sort = NULL; + impl->grids.place = NULL; + + // acquire command queue for sealing/unsealing + impl->cq = skc_runtime_acquire_cq_in_order(runtime); + + // acquire kernels + impl->kernels.place = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_PLACE); + impl->kernels.segment = 
skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK); + + // get config + struct skc_config const * const config = runtime->config; + + // initialize ring size with config values + skc_extent_ring_init(&impl->cmds.ring, + config->composition.cmds.elem_count, + config->composition.cmds.snap_count, + sizeof(union skc_cmd_place)); + + skc_extent_phw1g_tdrNs_alloc(runtime,&impl->cmds.extent ,sizeof(union skc_cmd_place) * config->composition.cmds.elem_count); + skc_extent_phrw_alloc (runtime,&impl->saved.extent,sizeof(skc_raster_t) * config->composition.raster_ids.elem_count); + skc_extent_phr_pdrw_alloc (runtime,&impl->atomics ,sizeof(struct skc_place_atomics)); + + skc_extent_pdrw_alloc (runtime,&impl->keys ,sizeof(skc_ttxk_t) * config->composition.keys.elem_count); + skc_extent_pdrw_alloc (runtime,&impl->offsets ,sizeof(skc_uint) * (1u << SKC_TTCK_HI_BITS_YX)); // 1MB + + // nothing saved + impl->saved.count = 0; + + // unseal the composition, zero the atomics, etc. + skc_composition_unseal_block_reset(impl,false,true); + + return SKC_ERR_SUCCESS; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/composition_cl_12.h b/src/compute/skc/platforms/cl_12/composition_cl_12.h new file mode 100644 index 0000000000..4f52090658 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/composition_cl_12.h @@ -0,0 +1,105 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +#include "composition.h" +#include "assert_state.h" +#include "grid.h" +#include "extent_cl_12.h" +#include "extent_ring.h" + +// +// composition states +// + +typedef enum skc_composition_state_e { + + SKC_COMPOSITION_STATE_UNSEALING, + SKC_COMPOSITION_STATE_UNSEALED, + SKC_COMPOSITION_STATE_SEALING, + SKC_COMPOSITION_STATE_SEALED + +} skc_composition_state_e; + +// +// IMPL +// + +struct skc_composition_impl +{ + struct skc_composition * composition; + struct skc_runtime * runtime; + + SKC_ASSERT_STATE_DECLARE(skc_composition_state_e); + + skc_int lock_count; // wip renders + + struct { + skc_grid_t sort; + skc_grid_t place; + } grids; + + cl_command_queue cq; + + struct { + cl_kernel place; + cl_kernel segment; + } kernels; + + // raster ids must be held until the composition is reset or + // released and then their refcounts can be decremented + struct { + struct skc_extent_phrw extent; + skc_uint count; + } saved; + + struct { + struct skc_extent_ring ring; // how many slots left? + struct skc_extent_phw1g_tdrNs extent; // wip command extent + } cmds; + + // composition extent length + struct skc_extent_phr_pdrw atomics; + + // composition ttck extent + struct skc_extent_pdrw keys; + + // key offsets in sealed and sorted ttck extent + struct skc_extent_pdrw offsets; +}; + +// +// ATOMICS +// + +struct skc_place_atomics +{ + skc_uint keys; + skc_uint offsets; +}; + +// +// ONLY VISIBLE WITHIN THIS RUNTIME +// + +void +skc_composition_retain_and_lock(struct skc_composition * const composition); + +void +skc_composition_unlock_and_release(struct skc_composition * const composition); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/config_cl.h b/src/compute/skc/platforms/cl_12/config_cl.h new file mode 100644 index 0000000000..0172857b07 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/config_cl.h @@ -0,0 +1,147 @@ +/* + * Copyright 2017 Google Inc. 
+ * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "runtime_cl.h" +#include "block_pool_cl.h" + +// +// FIXME -- define individual structs before defining skc_config +// + +struct skc_config +{ + struct { + struct { + skc_uint size; + skc_uint subbufs; + } host; // alignment determined by compiler + struct { + skc_uint size; + skc_uint subbufs; + } device; // alignment determined by device + } suballocator; + + struct { + skc_uint size; + } scheduler; + + struct { + skc_uint bytes; // bytes per subblock -- pow2 + skc_uint words; // words per subblock -- pow2 + // skc_uint words_log2; + } subblock; + + struct { + skc_uint bytes; // bytes per block -- pow2 + skc_uint words; // words per block -- pow2 + skc_uint subblocks; // subblocks per block -- block.bytes >= subblock.bytes + // skc_uint subblocks_log2; + } block; + + union skc_block_pool_size block_pool; + + struct { + skc_cq_type_e type; + skc_uint size; + } cq_pool; + + struct { + skc_uint size; // a large fraction of block pool size + skc_uint width; // determines number of launched reclamation subgroups + skc_uint recs; // how many in-flight width-subgroup reclamation grids + } handle_pool; + + struct { + skc_uint width; // tile width in pixels + skc_uint height; // tile height in pixels + skc_uint ratio; // subblocks per TTPB + } tile; + + struct { + struct { + skc_uint count; // # of subbufs in buffer + } buffer; + + struct { + skc_uint count; // # of blocks/commands in subbuf + } subbuf; + + struct { + size_t buffer; // block.bytes * subbuf.blocks * subbuf.count + size_t subbuf; // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN + } block; + + struct { + size_t buffer; // sizeof(skc_uint) * subbuf.blocks * subbuf.count + size_t subbuf; // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN + } command; + // + // skc_uint paths_lowat; + // + } paths_copy; + + struct { + struct { + skc_uint elem_count; + skc_uint snap_count; + } path_ids; + + struct { + skc_uint elem_count; + skc_uint snap_count; + } transforms; + + struct { + skc_uint elem_count; + skc_uint snap_count; + } clips; + + struct { + skc_uint elem_count; + skc_uint snap_count; + } fill; + + struct { + skc_uint elem_count; + skc_uint snap_count; + } raster_ids; + + struct { + skc_uint cmds; + } expand; + + struct { + skc_uint keys; + } rasterize; + } raster_cohort; + + struct { + struct { + skc_uint elem_count; + skc_uint snap_count; + } cmds; + + struct { + skc_uint elem_count; + } raster_ids; + + struct { + skc_uint elem_count; + } keys; + } composition; +}; + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/cq_pool_cl.c b/src/compute/skc/platforms/cl_12/cq_pool_cl.c new file mode 100644 index 0000000000..80cfe34cf8 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.c @@ -0,0 +1,152 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#ifndef NDEBUG +#include +#endif + +// +// +// + +#include + +// +// +// + +#include "runtime_cl_12.h" + +// +// This implementation is probably excessive. +// +// The command queue pool could easily be replaced with simply an LRU +// or even round-robin reuse pool. Even a small number of aliased +// command queues can probably enough concurrency. 
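//
// The round-robin alternative mentioned above could be as small as the
// sketch below. This is illustrative only -- it is not the
// implementation that follows -- and it assumes a fixed queue count
// plus direct use of the (deprecated-in-2.0) CL 1.2 entry point.
//

#include <CL/opencl.h>

#define SKC_CQ_RR_COUNT 8 // illustrative pool size

struct skc_cq_rr_pool
{
  cl_command_queue cq[SKC_CQ_RR_COUNT];
  unsigned         next;
};

static void
skc_cq_rr_pool_create(struct skc_cq_rr_pool * const pool,
                      cl_context              const context,
                      cl_device_id            const device_id)
{
  for (unsigned ii=0; ii<SKC_CQ_RR_COUNT; ii++)
    pool->cq[ii] = clCreateCommandQueue(context,device_id,0,NULL);

  pool->next = 0;
}

static cl_command_queue
skc_cq_rr_pool_acquire(struct skc_cq_rr_pool * const pool)
{
  // queues are aliased round-robin and never explicitly released
  return pool->cq[pool->next++ % SKC_CQ_RR_COUNT];
}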
+//

+#define SKC_CQ_POOL_EXPAND 1
+
+//
+//
+//
+
+void
+skc_cq_pool_create(struct skc_runtime * const runtime,
+                   struct skc_cq_pool * const pool,
+                   skc_uint             const type,
+                   skc_uint             const size)
+{
+  pool->type   = type;
+  pool->size   = size + 1; // an empty spot
+  pool->reads  = 0;
+  pool->writes = size;
+  pool->cq     = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq));
+
+  for (skc_uint ii=0; ii<size; ii++) {
+    pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
+  }
+  pool->cq[size] = NULL;
+}
+
+//
+//
+//
+
+void
+skc_cq_pool_dispose(struct skc_runtime * const runtime,
+                    struct skc_cq_pool *       pool)
+{
+  //
+  // FIXME -- release the command queues after waiting for the ring to
+  // be full with pool.size queues?
+  //
+  skc_runtime_host_perm_free(runtime,pool->cq);
+}
+
+//
+//
+//
+
+static
+void
+skc_cq_pool_write(struct skc_cq_pool * const pool,
+                  cl_command_queue           cq)
+{
+  pool->cq[pool->writes++ % pool->size] = cq;
+}
+
+//
+// only expand when completely empty
+//
+
+static
+void
+skc_cq_pool_expand(struct skc_runtime * const runtime,
+                   struct skc_cq_pool * const pool,
+                   skc_uint                   expand)
+{
+#ifndef NDEBUG
+  fprintf(stderr,"Expanding the cq_pool by: %u (%u)\n",expand,pool->size);
+#endif
+
+  // free old
+  skc_runtime_host_perm_free(runtime,pool->cq);
+
+  // the ring is empty
+  pool->size  += expand;
+  pool->cq     = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq));
+  pool->reads  = 0;
+  pool->writes = expand;
+
+  for (skc_uint ii=0; ii<expand; ii++)
+    pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
+}
+
+//
+//
+//
+
+static
+cl_command_queue
+skc_cq_pool_read(struct skc_runtime * const runtime,
+                 struct skc_cq_pool * const pool)
+{
+  // any command queues left?
+  if (pool->reads == pool->writes)
+    skc_cq_pool_expand(runtime,pool,SKC_CQ_POOL_EXPAND);
+
+  cl_command_queue cq = pool->cq[pool->reads++ % pool->size];
+
+  return cq;
+}
+
+//
+//
+//
+
+cl_command_queue
+skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime)
+{
+  return skc_cq_pool_read(runtime,&runtime->cq_pool);
+}
+
+void
+skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
+                                cl_command_queue           cq)
+{
+  skc_cq_pool_write(&runtime->cq_pool,cq);
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/cq_pool_cl.h b/src/compute/skc/platforms/cl_12/cq_pool_cl.h
new file mode 100644
index 0000000000..0cc73a2f82
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+#include "types.h"
+
+//
+// Why we need to wrap command queue creation:
+//
+//   - command queue creation is expensive
+//
+//   - the CL 1.2 function is deprecated in 2.0
+//
+
+struct skc_cq_pool
+{
+  skc_cq_type_e      type;
+  skc_uint           size;
+  skc_uint           reads;
+  skc_uint           writes;
+  cl_command_queue * cq;
+};
+
+//
+//
+//
+
+void
+skc_cq_pool_create(struct skc_runtime * const runtime,
+                   struct skc_cq_pool * const pool,
+                   skc_uint             const type,
+                   skc_uint             const size);
+
+void
+skc_cq_pool_dispose(struct skc_runtime * const runtime,
+                    struct skc_cq_pool *       pool);
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/device_cl_12.h b/src/compute/skc/platforms/cl_12/device_cl_12.h
new file mode 100644
index 0000000000..637b61ae10
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/device_cl_12.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2017 Google Inc.
+ * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +// +// +// + +#define SKC_CL_ARG(arg) sizeof(arg),&arg + +// +// +// + +typedef enum skc_device_kernel_id { + SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS, + SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS, + + SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, + SKC_DEVICE_KERNEL_ID_PATHS_COPY, + + SKC_DEVICE_KERNEL_ID_FILLS_EXPAND, + + SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL, + SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES, + SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS, + SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS, + SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_QUADS, + SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_CUBICS, + + SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK, + SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC, + + SKC_DEVICE_KERNEL_ID_PREFIX, + SKC_DEVICE_KERNEL_ID_PLACE, + SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, + + SKC_DEVICE_KERNEL_ID_RENDER, + + SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM, + SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM, + + // + SKC_DEVICE_KERNEL_ID_COUNT + +} skc_device_kernel_id; + +// +// +// + +void +skc_device_create(struct skc_runtime * const runtime); + + +void +skc_device_dispose(struct skc_runtime * const runtime); + + +// +// multi-threading/context/device requires multiple kernel instances +// + +cl_kernel +skc_device_acquire_kernel(struct skc_device * const device, + skc_device_kernel_id const type); + +// +// grid shape can vary greatly by target platform +// +void +skc_device_enqueue_kernel(struct skc_device * const device, + skc_device_kernel_id const type, + cl_command_queue cq, + cl_kernel kernel, + size_t const work_size, + cl_uint num_events_in_wait_list, + cl_event const * const event_wait_list, + cl_event * const event); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/export_cl_12.h b/src/compute/skc/platforms/cl_12/export_cl_12.h new file mode 100644 index 0000000000..e577282791 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/export_cl_12.h @@ -0,0 +1,63 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "skc.h" + +// +// +// + +skc_err +skc_path_builder_cl_12_create(struct skc_context * const context, + struct skc_path_builder * * const path_builder); + +// +// +// + +skc_err +skc_raster_builder_cl_12_create(struct skc_context * const context, + struct skc_raster_builder * * const raster_builder); + +// +// +// + +skc_err +skc_composition_cl_12_create(struct skc_context * const context, + struct skc_composition * * const composition); + +// +// +// + +skc_err +skc_styling_cl_12_create(struct skc_context * const context, + struct skc_styling * * const styling, + uint32_t const layers_count, + uint32_t const groups_count, + uint32_t const extras_count); + +// +// +// + +skc_err +skc_surface_cl_12_create(struct skc_context * const context, + struct skc_surface * * const surface); + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/extent_cl_12.c b/src/compute/skc/platforms/cl_12/extent_cl_12.c new file mode 100644 index 0000000000..73676d8063 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/extent_cl_12.c @@ -0,0 +1,459 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include + +#include "common/cl/assert_cl.h" +#include "extent_cl_12.h" +#include "runtime_cl_12.h" + +// +// DURABLE R/W HOST EXTENT -- STANDARD CACHED MEMORY +// + +void +skc_extent_phrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrw * const extent, + size_t const size) +{ + extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); +} + +void +skc_extent_phrw_free(struct skc_runtime * const runtime, + struct skc_extent_phrw * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hrw); +} + +// +// DURABLE R/W DEVICE EXTENT -- ALLOCATED FROM DEVICE HEAP +// + +void +skc_extent_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_pdrw * const extent, + size_t const size) +{ + extent->drw = skc_runtime_device_perm_alloc(runtime, + CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, + size); +} + +void +skc_extent_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_pdrw * const extent) +{ + skc_runtime_device_perm_free(runtime,extent->drw); +} + +// +// EPHEMERAL DEVICE R/W EXTENT -- ALLOCATED QUICKLY FROM A MANAGED RING +// + +void +skc_extent_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_tdrw * const extent, + size_t const size) +{ + extent->size = size; + extent->drw = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, + size,&extent->id,NULL); +} + +void +skc_extent_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_tdrw * const extent) +{ + skc_runtime_device_temp_free(runtime,extent->drw,extent->id); +} + +void +skc_extent_tdrw_zero(struct skc_extent_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + skc_uint const zero = 0; + + cl(EnqueueFillBuffer(cq, + extent->drw, + &zero, + sizeof(zero), + 0, + extent->size, + 0,NULL,event)); +} + +// +// DURABLE SMALL EXTENTS BACKING ATOMICS +// + +void +skc_extent_phr_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phr_pdrw * const extent, + size_t const size) +{ + extent->size = size; + extent->hr = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_ONLY,size); + extent->drw = skc_runtime_device_perm_alloc(runtime,CL_MEM_READ_WRITE,size); +} + +void +skc_extent_phr_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_phr_pdrw * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hr); + skc_runtime_device_perm_free(runtime,extent->drw); +} + +void +skc_extent_phr_pdrw_read(struct skc_extent_phr_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + cl(EnqueueReadBuffer(cq, + extent->drw, + CL_FALSE, + 0, + extent->size, + extent->hr, + 0,NULL,event)); +} + +void +skc_extent_phr_pdrw_zero(struct skc_extent_phr_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + skc_uint const zero = 0; + + cl(EnqueueFillBuffer(cq, + extent->drw, + &zero, + sizeof(zero), + 0, + extent->size, + 0,NULL,event)); +} + +// +// EPHEMERAL SMALL EXTENTS BACKING ATOMICS +// + +void +skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_thr_tdrw * const extent, + size_t const size) +{ + extent->size = size; + extent->hr = skc_runtime_host_temp_alloc(runtime, + SKC_MEM_FLAGS_READ_WRITE, + size,&extent->id.hr,NULL); + extent->drw = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_WRITE, + size, + &extent->id.drw, + NULL); +} + +void 
+skc_extent_thr_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_thr_tdrw * const extent) +{ + skc_runtime_host_temp_free(runtime,extent->hr,extent->id.hr); + skc_runtime_device_temp_free(runtime,extent->drw,extent->id.drw); +} + +void +skc_extent_thr_tdrw_read(struct skc_extent_thr_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + cl(EnqueueReadBuffer(cq, + extent->drw, + CL_FALSE, + 0, + extent->size, + extent->hr, + 0,NULL,event)); +} + +void +skc_extent_thr_tdrw_zero(struct skc_extent_thr_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + skc_uint const zero = 0; + + cl(EnqueueFillBuffer(cq, + extent->drw, + &zero, + sizeof(zero), + 0, + extent->size, + 0,NULL,event)); +} + +// +// DURABLE W/1 HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT +// + +void +skc_extent_phw1g_tdrNs_alloc(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent, + size_t const size) +{ + extent->hw1 = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_WRITE_ONLY,size); +} + +void +skc_extent_phw1g_tdrNs_free(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hw1); +} + +void +skc_extent_phw1g_tdrNs_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phw1g_tdrNs_snap * const snap) +{ + snap->snap = skc_extent_ring_snap_alloc(runtime,ring); +} + +void +skc_extent_phw1g_tdrNs_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent, + struct skc_extent_phw1g_tdrNs_snap * const snap, + cl_command_queue const cq, + cl_event * const event) +{ + struct skc_extent_ring const * const ring = snap->snap->ring; + + skc_uint const count = skc_extent_ring_snap_count(snap->snap); + size_t const size = count * ring->size.elem; + + snap->drN = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, + size,&snap->id,NULL); + + if (count == 0) + return; + + // possibly two copies + skc_uint const index_lo = snap->snap->reads & ring->size.mask; + skc_uint const count_max = ring->size.pow2 - index_lo; + skc_uint const count_lo = min(count_max,count); + size_t const bytes_lo = count_lo * ring->size.elem; + + if (count > count_max) + { + skc_uint const bytes_hi = (count - count_max) * ring->size.elem; + + cl(EnqueueWriteBuffer(cq, + snap->drN, + CL_FALSE, + bytes_lo, + bytes_hi, + extent->hw1, // offset_hi = 0 + 0,NULL,NULL)); + } + + size_t const offset_lo = index_lo * ring->size.elem; + + cl(EnqueueWriteBuffer(cq, + snap->drN, + CL_FALSE, + 0, + bytes_lo, + (skc_uchar*)extent->hw1 + offset_lo, + 0,NULL,event)); + +} + +void +skc_extent_phw1g_tdrNs_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs_snap * const snap) +{ + skc_runtime_device_temp_free(runtime,snap->drN,snap->id); + skc_extent_ring_snap_free(runtime,snap->snap); +} + +// +// DURABLE R/W HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT +// + +void +skc_extent_phrwg_tdrNs_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent, + size_t const size) +{ + extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); // WRITE-ONCE +} + +void +skc_extent_phrwg_tdrNs_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hrw); +} + +void 
+skc_extent_phrwg_tdrNs_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phrwg_tdrNs_snap * const snap) +{ + snap->snap = skc_extent_ring_snap_alloc(runtime,ring); +} + +void +skc_extent_phrwg_tdrNs_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent, + struct skc_extent_phrwg_tdrNs_snap * const snap, + cl_command_queue const cq, + cl_event * const event) +{ + struct skc_extent_ring const * const ring = snap->snap->ring; + + skc_uint const count = skc_extent_ring_snap_count(snap->snap); + size_t const size = count * ring->size.elem; + + snap->drN = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, + size,&snap->id,NULL); + + if (count == 0) + return; + + // possibly two copies + skc_uint const index_lo = snap->snap->reads & ring->size.mask; + skc_uint const count_max = ring->size.pow2 - index_lo; + skc_uint const count_lo = min(count_max,count); + size_t const bytes_lo = count_lo * ring->size.elem; + + if (count > count_max) + { + skc_uint const count_hi = count - count_max; + skc_uint const bytes_hi = count_hi * ring->size.elem; + + cl(EnqueueWriteBuffer(cq, + snap->drN, + CL_FALSE, + bytes_lo, + bytes_hi, + extent->hrw, // offset_hi = 0 + 0,NULL,NULL)); + } + + size_t offset_lo = index_lo * ring->size.elem; + + cl(EnqueueWriteBuffer(cq, + snap->drN, + CL_FALSE, + 0, + bytes_lo, + (skc_uchar*)extent->hrw + offset_lo, + 0,NULL,event)); + +} + +void +skc_extent_phrwg_tdrNs_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs_snap * const snap) +{ + skc_runtime_device_temp_free(runtime,snap->drN,snap->id); + skc_extent_ring_snap_free(runtime,snap->snap); +} + +// +// DURABLE HOST R/W RING WITH AN EPHEMERAL HOST R/1 SNAPSHOT +// +// Note that because the ring and snapshot are both in host memory and +// the snapshot blocks progress until freed we can simply point the +// fake ephemeral snapshot at the ring's durable extent. 
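//
// Both the device-snapshot copies above and the host-only snapshot
// below split a wrapped ring range into a "lo" span and an optional
// "hi" span. A small illustrative helper (not part of the source)
// isolating that arithmetic, assuming the ring capacity is a power of
// two so the mask is pow2 - 1:
//

#include <stddef.h>

struct skc_ring_span { size_t offset; size_t bytes; };

static void
skc_ring_split(skc_uint const reads, // ring read cursor
               skc_uint const count, // elements in the snapshot
               skc_uint const pow2,  // ring capacity -- power of two
               size_t   const elem,  // bytes per element
               struct skc_ring_span * const lo,
               struct skc_ring_span * const hi)
{
  skc_uint const index_lo  = reads & (pow2 - 1);  // where the snapshot starts
  skc_uint const count_max = pow2 - index_lo;     // contiguous elements before the wrap
  skc_uint const count_lo  = count < count_max ? count : count_max;

  lo->offset = (size_t)index_lo * elem;
  lo->bytes  = (size_t)count_lo * elem;

  hi->offset = 0;                                 // the wrapped tail restarts at the ring base
  hi->bytes  = (count > count_max) ? (size_t)(count - count_max) * elem : 0;
}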
+// + +void +skc_extent_phrwg_thr1s_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent, + size_t const size) +{ + extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); // WRITE-ONCE +} + +void +skc_extent_phrwg_thr1s_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hrw); +} + +void +skc_extent_phrwg_thr1s_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phrwg_thr1s_snap * const snap) +{ + snap->snap = skc_extent_ring_snap_alloc(runtime,ring); +} + +void +skc_extent_phrwg_thr1s_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent, + struct skc_extent_phrwg_thr1s_snap * const snap) +{ + struct skc_extent_ring const * const ring = snap->snap->ring; + + skc_uint const count = skc_extent_ring_snap_count(snap->snap); + skc_uint const index_lo = snap->snap->reads & ring->size.mask; + skc_uint const count_max = ring->size.pow2 - index_lo; + + snap->count.lo = min(count_max,count); + snap->hr1.lo = (skc_uchar*)extent->hrw + (index_lo * ring->size.elem); + + if (count > count_max) + { + snap->count.hi = count - count_max; + snap->hr1.hi = extent->hrw; + } + else + { + snap->count.hi = 0; + snap->hr1.hi = NULL; + } +} + +void +skc_extent_phrwg_thr1s_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s_snap * const snap) +{ + skc_extent_ring_snap_free(runtime,snap->snap); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/extent_cl_12.h b/src/compute/skc/platforms/cl_12/extent_cl_12.h new file mode 100644 index 0000000000..47ba951bb3 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/extent_cl_12.h @@ -0,0 +1,476 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +#include "suballocator.h" +#include "extent_ring.h" + +// +// Legend: +// +// p : durable +// t : ephemeral +// h : host +// d : device +// r : read +// w : write +// 1 : once -- e.g. w1 is 'write-once' +// N : many -- e.g. 
rN is 'read-many' +// g : ring +// s : ring snapshot +// +// Notes: +// +// rw : for now, read-write implies read-write many +// + +// +// DURABLE R/W HOST EXTENT -- STANDARD CACHED MEMORY +// + +struct skc_extent_phrw +{ + void * hrw; +}; + +void +skc_extent_phrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrw * const extent, + size_t const size); + +void +skc_extent_phrw_free(struct skc_runtime * const runtime, + struct skc_extent_phrw * const extent); + +// +// DURABLE R/W DEVICE EXTENT -- ALLOCATED FROM DEVICE HEAP +// + +struct skc_extent_pdrw +{ + cl_mem drw; +}; + +void +skc_extent_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_pdrw * const extent, + size_t const size); + +void +skc_extent_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_pdrw * const extent); + +// +// EPHEMERAL DEVICE R/W EXTENT -- ALLOCATED QUICKLY FROM A MANAGED RING +// + +struct skc_extent_tdrw +{ + size_t size; + cl_mem drw; + skc_subbuf_id_t id; +}; + +void +skc_extent_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_tdrw * const extent, + size_t const size); + +void +skc_extent_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_tdrw * const extent); + +void +skc_extent_tdrw_zero(struct skc_extent_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +// +// DURABLE SMALL EXTENTS BACKING ATOMICS +// + +struct skc_extent_phr_pdrw +{ + size_t size; // must be multiple of words + void * hr; + cl_mem drw; +}; + +void +skc_extent_phr_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phr_pdrw * const extent, + size_t const size); + +void +skc_extent_phr_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_phr_pdrw * const extent); + +void +skc_extent_phr_pdrw_read(struct skc_extent_phr_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phr_pdrw_zero(struct skc_extent_phr_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +// +// EPHEMERAL SMALL EXTENTS BACKING ATOMICS +// + +struct skc_extent_thr_tdrw +{ + size_t size; // must be multiple of words + + void * hr; + cl_mem drw; + + struct { + skc_subbuf_id_t hr; + skc_subbuf_id_t drw; + } id; +}; + +void +skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_thr_tdrw * const extent, + size_t const size); + +void +skc_extent_thr_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_thr_tdrw * const extent); + +void +skc_extent_thr_tdrw_read(struct skc_extent_thr_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_thr_tdrw_zero(struct skc_extent_thr_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +// +// DURABLE W/1 HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT +// + +struct skc_extent_phw1g_tdrNs +{ + void * hw1; +}; + +struct skc_extent_phw1g_tdrNs_snap +{ + struct skc_extent_ring_snap * snap; + cl_mem drN; + skc_subbuf_id_t id; +}; + +void +skc_extent_phw1g_tdrNs_alloc(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent, + size_t const size); + +void +skc_extent_phw1g_tdrNs_free(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent); + +void +skc_extent_phw1g_tdrNs_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phw1g_tdrNs_snap * const snap); + +void +skc_extent_phw1g_tdrNs_snap_alloc(struct skc_runtime * const runtime, 
+ struct skc_extent_phw1g_tdrNs * const extent, + struct skc_extent_phw1g_tdrNs_snap * const snap, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phw1g_tdrNs_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs_snap * const snap); + +// +// DURABLE R/W HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT +// + +struct skc_extent_phrwg_tdrNs +{ + void * hrw; +}; + +struct skc_extent_phrwg_tdrNs_snap +{ + struct skc_extent_ring_snap * snap; + cl_mem drN; + skc_subbuf_id_t id; +}; + +void +skc_extent_phrwg_tdrNs_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent, + size_t const size); + +void +skc_extent_phrwg_tdrNs_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent); + +void +skc_extent_phrwg_tdrNs_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phrwg_tdrNs_snap * const snap); + +void +skc_extent_phrwg_tdrNs_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent, + struct skc_extent_phrwg_tdrNs_snap * const snap, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phrwg_tdrNs_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs_snap * const snap); + +// +// DURABLE HOST R/W RING WITH AN EPHEMERAL HOST R/1 SNAPSHOT +// +// Note that because the ring and snapshot are both in host memory and +// the snapshot blocks progress until freed we can simply point the +// fake ephemeral snapshot at the ring's durable extent. +// + +struct skc_extent_phrwg_thr1s +{ + void * hrw; +}; + +struct skc_extent_phrwg_thr1s_snap +{ + struct skc_extent_ring_snap * snap; + + struct { + skc_uint lo; + skc_uint hi; + } count; + + struct { + void * lo; + void * hi; + } hr1; +}; + +void +skc_extent_phrwg_thr1s_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent, + size_t const size); + +void +skc_extent_phrwg_thr1s_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent); + +void +skc_extent_phrwg_thr1s_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phrwg_thr1s_snap * const snap); + +void +skc_extent_phrwg_thr1s_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent, + struct skc_extent_phrwg_thr1s_snap * const snap); + +void +skc_extent_phrwg_thr1s_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s_snap * const snap); + +// +// EPHEMERAL MAPPING +// +// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. 
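//
// Before moving on to the mapped extents, a hedged usage sketch of the
// write-once ring + ephemeral device snapshot declared above: snapshot
// the pending ring entries, copy them to a temporary device extent,
// hand that extent to a kernel, then free the snapshot. The kernel
// argument index, the blocking wait, and the helper name are
// illustrative -- the real pipeline is event-driven.
//

static void
example_flush_ring(struct skc_runtime            * const runtime,
                   struct skc_extent_phw1g_tdrNs * const extent,
                   struct skc_extent_ring        * const ring,
                   cl_command_queue                const cq,
                   cl_kernel                       const kernel)
{
  struct skc_extent_phw1g_tdrNs_snap snap;

  skc_extent_phw1g_tdrNs_snap_init (runtime,ring,&snap);           // capture the wip entries
  skc_extent_phw1g_tdrNs_snap_alloc(runtime,extent,&snap,cq,NULL); // async copy: host ring -> snap.drN

  size_t const work_size = skc_extent_ring_snap_count(snap.snap);  // assumes a non-empty snapshot

  cl_event complete;

  cl(SetKernelArg(kernel,0,SKC_CL_ARG(snap.drN)));                 // kernel reads the snapshot
  cl(EnqueueNDRangeKernel(cq,kernel,1,NULL,&work_size,NULL,0,NULL,&complete));
  cl(Flush(cq));

  cl(WaitForEvents(1,&complete));
  cl(ReleaseEvent(complete));

  skc_extent_phw1g_tdrNs_snap_free(runtime,&snap);                 // release ring slots and temp extent
}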
+// + +#if 0 +struct skc_extent_thrw_tdrw +{ + size_t size; + cl_mem drw; + skc_subbuf_id_t id; +}; + +void +skc_extent_thrw_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_thrw_tdrw * const extent, + size_t const size); + +void +skc_extent_thrw_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_thrw_tdrw * const extent); + +void * +skc_extent_thrw_tdrw_map_size(struct skc_extent_thrw_tdrw * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event); + +void * +skc_extent_thrw_tdrw_map(struct skc_extent_thrw_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_thrw_tdrw_unmap(struct skc_extent_thrw_tdrw * const extent, + void * const hrN, + cl_command_queue const cq, + cl_event * const event); +#endif + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. +// + +struct skc_extent_phrw_pdrw +{ + size_t size; + cl_mem drw; +}; + +void +skc_extent_phrw_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrw_pdrw * const extent, + size_t const size); + +void +skc_extent_phrw_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_phrw_pdrw * const extent); + +void * +skc_extent_phrw_pdrw_map_size(struct skc_extent_phrw_pdrw * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event); + +void * +skc_extent_phrw_pdrw_map(struct skc_extent_phrw_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phrw_pdrw_unmap(struct skc_extent_phrw_pdrw * const extent, + void * const hrN, + cl_command_queue const cq, + cl_event * const event); + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO R/O HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO W/O DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. +// + +struct skc_extent_phrN_pdwN +{ + size_t size; + cl_mem dwN; +}; + +void +skc_extent_phrN_pdwN_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrN_pdwN * const extent, + size_t const size); + +void +skc_extent_phrN_pdwN_free(struct skc_runtime * const runtime, + struct skc_extent_phrN_pdwN * const extent); + +void * +skc_extent_phrN_pdwN_map_size(struct skc_extent_phrN_pdwN * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event); + +void * +skc_extent_phrN_pdwN_map(struct skc_extent_phrN_pdwN * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phrN_pdwN_unmap(struct skc_extent_phrN_pdwN * const extent, + void * const hrN, + cl_command_queue const cq, + cl_event * const event); + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO W/O HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/O DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. 
+// + +struct skc_extent_phwN_pdrN +{ + size_t size; + cl_mem drN; +}; + +void +skc_extent_phwN_pdrN_alloc(struct skc_runtime * const runtime, + struct skc_extent_phwN_pdrN * const extent, + size_t const size); + +void +skc_extent_phwN_pdrN_free(struct skc_runtime * const runtime, + struct skc_extent_phwN_pdrN * const extent); + +void * +skc_extent_phwN_pdrN_map_size(struct skc_extent_phwN_pdrN * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event); + +void * +skc_extent_phwN_pdrN_map(struct skc_extent_phwN_pdrN * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phwN_pdrN_unmap(struct skc_extent_phwN_pdrN * const extent, + void * const hwm, + cl_command_queue const cq, + cl_event * const event); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/extent_cl_12_unified.c b/src/compute/skc/platforms/cl_12/extent_cl_12_unified.c new file mode 100644 index 0000000000..69c669ad54 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/extent_cl_12_unified.c @@ -0,0 +1,281 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT NONE OF THESE EXTENTS CHECK FOR ZERO-SIZED ALLOCATIONS. +// THAT'S OK FOR NOW. +// + +#include + +#include "runtime_cl_12.h" +#include "extent_cl_12.h" +#include "common/cl/assert_cl.h" + +// +// EPHEMERAL MAPPING +// +// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. +// + +#if 0 + +#pragma message("struct skc_extent_thrw_tdrw will be removed once the sorter is installed.") + +void +skc_extent_thrw_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_thrw_tdrw * const extent, + size_t const size) +{ + extent->drw = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_WRITE /* | CL_MEM_ALLOC_HOST_PTR */, + size,&extent->id,&extent->size); +} + +void +skc_extent_thrw_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_thrw_tdrw * const extent) +{ + skc_runtime_device_temp_free(runtime,extent->drw,extent->id); +} + +void * +skc_extent_thrw_tdrw_map_size(struct skc_extent_thrw_tdrw * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event) +{ + cl_int cl_err; + + void * hrw = clEnqueueMapBuffer(cq,extent->drw, + CL_FALSE, + CL_MAP_READ | CL_MAP_WRITE,0,size, + 0,NULL,event,&cl_err); cl_ok(cl_err); + + return hrw; +} + +void * +skc_extent_thrw_tdrw_map(struct skc_extent_thrw_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + return skc_extent_thrw_tdrw_map_size(extent,extent->size,cq,event); +} + +void +skc_extent_thrw_tdrw_unmap(struct skc_extent_thrw_tdrw * const extent, + void * const hrw, + cl_command_queue const cq, + cl_event * const event) +{ + cl(EnqueueUnmapMemObject(cq,extent->drw,hrw,0,NULL,event)); +} + +#endif + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. 
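//
// Hedged usage sketch of the durable mapped extent implemented just
// below: map it, initialize it on the host, unmap it so the device can
// use it, and free it when done. The size and the surrounding control
// flow are illustrative only.
//

static void
example_phrw_pdrw_roundtrip(struct skc_runtime * const runtime,
                            cl_command_queue     const cq)
{
  struct skc_extent_phrw_pdrw extent;

  skc_extent_phrw_pdrw_alloc(runtime,&extent,4096);

  // the map is enqueued without blocking, so wait before touching it
  cl_event mapped;
  void *   hrw = skc_extent_phrw_pdrw_map(&extent,cq,&mapped);

  cl(WaitForEvents(1,&mapped));
  cl(ReleaseEvent(mapped));

  // ... write initial data through hrw here ...

  skc_extent_phrw_pdrw_unmap(&extent,hrw,cq,NULL); // hand the extent back to the device
  cl(Flush(cq));

  // ... enqueue kernels that read/write extent.drw here ...

  skc_extent_phrw_pdrw_free(runtime,&extent);
}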
+// + +void +skc_extent_phrw_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrw_pdrw * const extent, + size_t const size) +{ + cl_int cl_err; + + extent->size = size; + extent->drw = clCreateBuffer(runtime->cl.context, + CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + size,NULL,&cl_err); cl_ok(cl_err); +} + +void +skc_extent_phrw_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_phrw_pdrw * const extent) +{ + cl(ReleaseMemObject(extent->drw)); +} + +void * +skc_extent_phrw_pdrw_map_size(struct skc_extent_phrw_pdrw * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event) +{ + cl_int cl_err; + + void * hrw = clEnqueueMapBuffer(cq,extent->drw, + CL_FALSE, + CL_MAP_READ | CL_MAP_WRITE,0,size, + 0,NULL,event,&cl_err); cl_ok(cl_err); + + return hrw; +} + +void * +skc_extent_phrw_pdrw_map(struct skc_extent_phrw_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + return skc_extent_phrw_pdrw_map_size(extent,extent->size,cq,event); +} + +void +skc_extent_phrw_pdrw_unmap(struct skc_extent_phrw_pdrw * const extent, + void * const hrw, + cl_command_queue const cq, + cl_event * const event) +{ + cl(EnqueueUnmapMemObject(cq,extent->drw,hrw,0,NULL,event)); +} + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO R/O HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO W/O DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. +// + +void +skc_extent_phrN_pdwN_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrN_pdwN * const extent, + size_t const size) +{ + cl_int cl_err; + + extent->size = size; + extent->dwN = clCreateBuffer(runtime->cl.context, + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + size,NULL,&cl_err); cl_ok(cl_err); +} + +void +skc_extent_phrN_pdwN_free(struct skc_runtime * const runtime, + struct skc_extent_phrN_pdwN * const extent) +{ + cl(ReleaseMemObject(extent->dwN)); +} + +void * +skc_extent_phrN_pdwN_map_size(struct skc_extent_phrN_pdwN * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event) +{ + cl_int cl_err; + + void * hrN = clEnqueueMapBuffer(cq,extent->dwN, + CL_FALSE, + CL_MAP_READ,0,size, + 0,NULL,event,&cl_err); cl_ok(cl_err); + + return hrN; +} + +void * +skc_extent_phrN_pdwN_map(struct skc_extent_phrN_pdwN * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + return skc_extent_phrN_pdwN_map_size(extent,extent->size,cq,event); +} + +void +skc_extent_phrN_pdwN_unmap(struct skc_extent_phrN_pdwN * const extent, + void * const hrN, + cl_command_queue const cq, + cl_event * const event) +{ + cl(EnqueueUnmapMemObject(cq,extent->dwN,hrN,0,NULL,event)); +} + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO W/O HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/O DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. 
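//
// The cl(...) and cl_ok(...) wrappers used throughout these files come
// from "common/cl/assert_cl.h", which is not part of this diff. A
// plausible minimal stand-in is sketched here purely for readability;
// the real macros may differ (for example by printing the failing
// expression). Assumes the OpenCL headers are already included.
//

#include <stdio.h>
#include <stdlib.h>

#define cl_ok(err)                                       \
  do {                                                   \
    cl_int const err_ = (err);                           \
    if (err_ != CL_SUCCESS) {                            \
      fprintf(stderr,"OpenCL error %d at %s:%d\n",       \
              err_,__FILE__,__LINE__);                   \
      abort();                                           \
    }                                                    \
  } while (0)

// cl(Flush(cq)) expands to cl_ok(clFlush(cq))
#define cl(func_and_args) cl_ok(cl##func_and_args)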
+// + +void +skc_extent_phwN_pdrN_alloc(struct skc_runtime * const runtime, + struct skc_extent_phwN_pdrN * const extent, + size_t const size) +{ + cl_int cl_err; + + extent->size = size; + extent->drN = clCreateBuffer(runtime->cl.context, + CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + size,NULL,&cl_err); cl_ok(cl_err); +} + +void +skc_extent_phwN_pdrN_free(struct skc_runtime * const runtime, + struct skc_extent_phwN_pdrN * const extent) +{ + cl(ReleaseMemObject(extent->drN)); +} + +void * +skc_extent_phwN_pdrN_map_size(struct skc_extent_phwN_pdrN * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event) +{ + cl_int cl_err; + + void * hwN = clEnqueueMapBuffer(cq,extent->drN, + CL_FALSE, + CL_MAP_WRITE,0,size, + 0,NULL,event,&cl_err); cl_ok(cl_err); + + return hwN; +} + +void * +skc_extent_phwN_pdrN_map(struct skc_extent_phwN_pdrN * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + return skc_extent_phwN_pdrN_map_size(extent,extent->size,cq,event); +} + +void +skc_extent_phwN_pdrN_unmap(struct skc_extent_phwN_pdrN * const extent, + void * const hwN, + cl_command_queue const cq, + cl_event * const event) +{ + cl(EnqueueUnmapMemObject(cq,extent->drN,hwN,0,NULL,event)); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/gl/interop.c b/src/compute/skc/platforms/cl_12/gl/interop.c new file mode 100644 index 0000000000..6697bb7e83 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/gl/interop.c @@ -0,0 +1,629 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include + +// +// +// + +#include +#include +#include +#include + +// +// +// + +#include "common/cl/assert_cl.h" +#include "types.h" + +// +// +// + +#include "interop.h" +#include "context.h" +#include "runtime_cl_12.h" + +// +// +// + +#include "svg2skc/transform_stack.h" + +// +// +// + +#if 1 +#define SKC_IMAGE_FORMAT GL_RGBA8 +#else +#define SKC_IMAGE_FORMAT GL_RGBA16F +#endif + +// +// +// + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +// +// +// + +struct skc_interop_fb +{ + cl_context context; + + GLuint fbo; + GLuint rbo; + + cl_mem mem; + + int width; + int height; + + bool is_srgb; + bool is_vsync_on; + bool is_fullscreen; + bool is_iconified; + bool is_resized; + bool is_spinning; + bool is_info; + + skc_float scale; + skc_float2 translate; + float rotate_theta; +}; + +static struct skc_interop_fb fb = + { + .mem = NULL, + + .is_srgb = true, + .is_vsync_on = false, + .is_fullscreen = false, + .is_iconified = false, + .is_resized = true, + .is_spinning = false, + .is_info = false, + + .scale = 1.0f, + .translate = { 0.0f, 0.0f }, + .rotate_theta = 0.0f + }; + +// +// FPS COUNTER FROM HERE: +// +// http://antongerdelan.net/opengl/glcontext2.html +// + +static +void +skc_interop_fps(GLFWwindow * window) +{ + if (fb.is_fullscreen) + return; + + // static fps counters + static double stamp_prev = 0.0; + static int frame_count = 0; + + // locals + double const stamp_curr = glfwGetTime(); + double const elapsed = stamp_curr - stamp_prev; + + if (elapsed >= 0.5) + { + stamp_prev = stamp_curr; + + double const fps = (double)frame_count / elapsed; + + char tmp[64]; + + sprintf_s(tmp,64,"(%d x %d) - VSync %s - sRGB %s - FPS: %.2f", + fb.width,fb.height, + fb.is_vsync_on ? "ON" : "OFF", + fb.is_srgb ? 
"ENABLED" : "DISABLED", + fps); + + glfwSetWindowTitle(window,tmp); + + frame_count = 0; + } + + frame_count++; +} + +// +// INITIALIZE GLFW/GLAD +// + +static +void +skc_interop_error_callback(int error, char const * description) +{ + fputs(description,stderr); +} + +// +// +// + +static +void +skc_interop_iconify_callback(GLFWwindow * window, int iconified) +{ + fb.is_iconified = iconified; +} + +// +// +// + +static +void +skc_interop_key_callback(GLFWwindow * window, int key, int scancode, int action, int mods) +{ + if (action == GLFW_RELEASE) + return; + + switch (key) + { + case GLFW_KEY_EQUAL: + fb.rotate_theta = 0.0f; + break; + + case GLFW_KEY_I: + fb.is_info = true; + break; + + case GLFW_KEY_R: + fb.is_spinning ^= true; + break; + + case GLFW_KEY_S: + fb.is_srgb ^= true; + if (fb.is_srgb) + glEnable(GL_FRAMEBUFFER_SRGB); + else + glDisable(GL_FRAMEBUFFER_SRGB); + break; + + case GLFW_KEY_V: + fb.is_vsync_on ^= true; + glfwSwapInterval(fb.is_vsync_on ? 1 : 0); + break; + + case GLFW_KEY_W: + glfwSetWindowSize(window,1024,1024); + break; + + case GLFW_KEY_ESCAPE: + glfwSetWindowShouldClose(window,GL_TRUE); + break; + } +} + +static +void +skc_interop_window_size_callback(GLFWwindow * window, int width, int height) +{ + fb.width = width; + fb.height = height; + fb.is_resized = true; + +#if 0 + skc_render_kernel_set_clip(0,0,width,height); +#endif +} + +static +void +skc_interop_scale(double const scale_offset) +{ +#define SKC_SCALE_FACTOR 1.05 + + static double scale_exp = 0.0; + + scale_exp += scale_offset; + fb.scale = (float)pow(SKC_SCALE_FACTOR,scale_exp); +} + +static +void +skc_interop_scroll_callback(GLFWwindow * window, double xoffset, double yoffset) +{ + bool const ctrl = + (glfwGetKey(window,GLFW_KEY_LEFT_CONTROL) == GLFW_PRESS) || + (glfwGetKey(window,GLFW_KEY_RIGHT_CONTROL) == GLFW_PRESS); + + if (!ctrl) + return; + + skc_interop_scale(yoffset); +} + +static +void +skc_interop_translate(float const dx, float const dy) +{ + float const dx_scaled = dx / fb.scale; + float const dy_scaled = dy / fb.scale; + + float const cos_theta = cosf(fb.rotate_theta); // replace with cospi if available + float const sin_theta = sinf(fb.rotate_theta); // replace with sinpi if available + + fb.translate.x += dx_scaled*cos_theta + dy_scaled*sin_theta; + fb.translate.y += dy_scaled*cos_theta - dx_scaled*sin_theta; +} + +static +void +skc_interop_cursor_position_callback(GLFWwindow * window, double x, double y) +{ + int const state = glfwGetMouseButton(window,GLFW_MOUSE_BUTTON_LEFT); + + static bool is_mouse_dragging = false; + static float x_prev=0.0, y_prev=0.0; + + float const mx = (float)x; + float const my = (float)y; + + if (state == GLFW_PRESS) + { + if (is_mouse_dragging) + { + const bool ctrl = + (glfwGetKey(window,GLFW_KEY_LEFT_CONTROL) == GLFW_PRESS) || + (glfwGetKey(window,GLFW_KEY_RIGHT_CONTROL) == GLFW_PRESS); + + if (ctrl) + { + float const cx = 0.5f * fb.width; + float const cy = 0.5f * fb.height; + + // find angle between mouse and center + float const vx = x_prev - cx; + float const vy = y_prev - cy; + + float const wx = mx - cx; + float const wy = my - cy; + + float const len = sqrtf((vx*vx + vy*vy) * (wx*wx + wy*wy)); + + if (len > 0.0f) + { + float const dot = vx*wx + vy*wy; + float const da = acosf(dot / len); + + if (vx*wy - vy*wx >= 0.0f) + fb.rotate_theta += da; + else + fb.rotate_theta -= da; + + fb.rotate_theta = fmodf(fb.rotate_theta,(float)(M_PI*2.0)); + } + } + else + { + skc_interop_translate(mx - x_prev, + my - y_prev); + } + } + else + { + is_mouse_dragging 
= true; + } + + x_prev = mx; + y_prev = my; + } + else + { + is_mouse_dragging = false; + } +} + +// +// +// + +static +void +skc_interop_resize() +{ + fb.is_resized = false; + + // release the image2d + if (fb.mem != NULL) + cl(ReleaseMemObject(fb.mem)); + + // resize rbo + glNamedRenderbufferStorage(fb.rbo, + SKC_IMAGE_FORMAT, + fb.width, + fb.height); + + // attach rbo to fbo + glNamedFramebufferRenderbuffer(fb.fbo, + GL_COLOR_ATTACHMENT0, + GL_RENDERBUFFER, + fb.rbo); + // + // + // + cl_int cl_err; + + fb.mem = clCreateFromGLRenderbuffer(fb.context, + CL_MEM_WRITE_ONLY, + fb.rbo, + &cl_err); cl_ok(cl_err); + // + // for debugging porpoises! + // + cl_image_format format; + + cl(GetImageInfo(fb.mem, + CL_IMAGE_FORMAT, + sizeof(format), + &format, + NULL)); +} + +// +// +// + +static +void +skc_interop_acquire() +{ + // frame buffer object + glCreateFramebuffers(1,&fb.fbo); + + // render buffer object w/a color buffer + glCreateRenderbuffers(1,&fb.rbo); + + // size rbo + glNamedRenderbufferStorage(fb.rbo, + SKC_IMAGE_FORMAT, + fb.width, + fb.height); + + // attach rbo to fbo + glNamedFramebufferRenderbuffer(fb.fbo, + GL_COLOR_ATTACHMENT0, + GL_RENDERBUFFER, + fb.rbo); +} + +void +skc_interop_register(skc_context_t context) +{ + fb.context = context->runtime->cl.context; +} + +// +// +// + +void +skc_interop_init(GLFWwindow * * window) +{ + // + // INITIALIZE GLFW/GLAD + // + glfwSetErrorCallback(skc_interop_error_callback); + + if (!glfwInit()) + exit(EXIT_FAILURE); + + GLFWmonitor * const primary = glfwGetPrimaryMonitor(); + GLFWvidmode const * const mode = glfwGetVideoMode(primary); + + if (fb.is_fullscreen) + { + fb.width = mode->width; + fb.height = mode->height; + } + else + { + fb.width = 1600; + fb.height = 1024; + } + + glfwWindowHint(GLFW_ALPHA_BITS, 0); + glfwWindowHint(GLFW_DEPTH_BITS, 0); + glfwWindowHint(GLFW_STENCIL_BITS, 0); + + glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE); + + glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); + glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5); + + glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); + + *window = glfwCreateWindow(fb.width,fb.height, + "Skia Compute", + fb.is_fullscreen ? primary : NULL, + NULL); + + if (*window == NULL) + { + glfwTerminate(); + exit(EXIT_FAILURE); + } + + glfwMakeContextCurrent(*window); + + // set up GLAD + gladLoadGLLoader((GLADloadproc)glfwGetProcAddress); + + // ignore vsync for now + glfwSwapInterval(fb.is_vsync_on ? 1 : 0); + + // only copy r/g/b + glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); + + // enable SRGB, disable scissor + glEnable(GL_FRAMEBUFFER_SRGB); + glDisable(GL_SCISSOR_TEST); + + // + // SET USER POINTER AND CALLBACKS + // + glfwSetKeyCallback (*window,skc_interop_key_callback); + glfwSetFramebufferSizeCallback(*window,skc_interop_window_size_callback); + glfwSetScrollCallback (*window,skc_interop_scroll_callback); + glfwSetCursorPosCallback (*window,skc_interop_cursor_position_callback); + glfwSetWindowIconifyCallback (*window,skc_interop_iconify_callback); + + // + // + // + fprintf(stderr, + "GL_VENDOR : %s\n" + "GL_RENDERER : %s\n", + glGetString(GL_VENDOR), + glGetString(GL_RENDERER)); + + // + // acquire an FBO/RBO + // + skc_interop_acquire(); +} + +// +// +// + +#define SKC_ROTATE_STEP ((float)(M_PI / 180.0)) + +static +void +skc_interop_transform(struct skc_transform_stack * ts) +{ + // OpenGL'ism + skc_transform_stack_push_affine(ts, + 1.0f, 0.0f,0.0f, + 0.0f,-1.0f,(float)fb.height); + // multiply + skc_transform_stack_concat(ts); + + // spinner... 
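  //
  // The affine push above flips Y to match OpenGL's bottom-left
  // origin. The spinner below advances the rotation angle one step
  // per frame and wraps it into [0,2*pi) with fmodf so it never
  // grows without bound.
  //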
+ if (fb.is_spinning) + fb.rotate_theta = fmodf(fb.rotate_theta + SKC_ROTATE_STEP,(float)(M_PI*2.0)); + + // always rotate and scale around surface center point + skc_transform_stack_push_rotate_scale_xy(ts, + fb.rotate_theta, + fb.scale,fb.scale, + 0.5f*fb.width,0.5f*fb.height); + skc_transform_stack_concat(ts); + + // where did the mouse take us? + skc_transform_stack_push_translate(ts, + fb.translate.x,fb.translate.y); + skc_transform_stack_concat(ts); +} + + +void +skc_interop_poll(GLFWwindow * window, + struct skc_transform_stack * ts) +{ + // wait until uniconified + while (fb.is_iconified) + { + glfwWaitEvents(); + continue; + } + + // what's happended? + glfwPollEvents(); + + // resize? + if (fb.is_resized) + skc_interop_resize(); + + // monitor fps + skc_interop_fps(window); + + skc_interop_transform(ts); +} + +// +// +// + +void +skc_interop_blit(GLFWwindow * window) +{ + // blit skc rbo + glBlitNamedFramebuffer(fb.fbo,0, + 0,0,fb.width,fb.height, + 0,0,fb.width,fb.height, + GL_COLOR_BUFFER_BIT, + GL_NEAREST); + +#if 0 + // + // FIXME -- this clear does nothing! + // + // As a hack we're clearing the interop'd RBO with a + // clEnqueueFillImage(). + // + float const rgba[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + // GLenum const attachments[] = { GL_COLOR_ATTACHMENT0 }; + // glInvalidateNamedFramebufferData(fb.fbo,1,attachments); + glClearNamedFramebufferfv(fb.fbo,GL_COLOR,0,rgba); +#endif + + // swap buffers + glfwSwapBuffers(window); +} + +// +// +// + +void * +skc_interop_get_fb(GLFWwindow * window) +{ + glFlush(); + + return fb.mem; +} + +// +// +// + +void +skc_interop_get_dim(uint32_t dim[2]) +{ + dim[0] = fb.width; + dim[1] = fb.height; +} + +// +// +// + + diff --git a/src/compute/skc/platforms/cl_12/gl/interop.h b/src/compute/skc/platforms/cl_12/gl/interop.h new file mode 100644 index 0000000000..112d365764 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/gl/interop.h @@ -0,0 +1,42 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "skc.h" + +// +// +// + +void +skc_interop_init(GLFWwindow * * window); + +void +skc_interop_register(skc_context_t context); + +void +skc_interop_poll(GLFWwindow * window, + struct skc_transform_stack * ts); + +void * +skc_interop_get_fb(GLFWwindow * window); + +void +skc_interop_get_dim(uint32_t dim[2]); + +void +skc_interop_blit(GLFWwindow * window); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/handle_pool_cl_12.c b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.c new file mode 100644 index 0000000000..65288c3656 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.c @@ -0,0 +1,752 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include + +// +// +// + +#include "common/cl/assert_cl.h" + +#include "block.h" +#include "grid.h" +#include "config_cl.h" +#include "runtime_cl_12.h" + +// +// FIXME -- these comments are now quite stale +// +// +// HANDLE/ACQUIRE RELEASE +// +// The runtime vends handles just in case we decide to exploit shared +// virtual memory. But for most platforms and devices we will have a +// pool of host-managed handles and on the device there will be a +// table that maps the host handle to a device-managed memory block. 
+// +// HANDLE READINESS +// +// A host handle may reference a path or a raster which is not ready +// for use further down the pipeline because it hasn't yet been +// processed by the device. +// +// The simplest scheme for providing every handle a readiness state is +// to build a map that that marks a new handle as being not-ready +// while being processed by a particular grid id. When the final +// sub-pipeline grid responsible for the path or raster is complete, +// then mark the handle as being ready and eventually return the grid +// id back to the pool. This can be performed on a separate thread. +// +// The side-benefit of this approach is that a handle's reference +// count integral type can spare some bits for its associated grid id. +// +// A more memory-intensive approach uses a 64-bit epoch+grid key and +// relies on the ~56 bits of epoch space to avoid any post +// sub-pipeline status update by assuming that a handle and grid will +// match or mismatch when queried. +// + +#define SKC_HANDLE_REFCNT_HOST_BITS (SKC_MEMBER_SIZE(union skc_handle_refcnt,h) * 8) +#define SKC_HANDLE_REFCNT_DEVICE_BITS (SKC_MEMBER_SIZE(union skc_handle_refcnt,d) * 8) + +#define SKC_HANDLE_REFCNT_HOST_MAX SKC_BITS_TO_MASK(SKC_HANDLE_REFCNT_HOST_BITS) +#define SKC_HANDLE_REFCNT_DEVICE_MAX SKC_BITS_TO_MASK(SKC_HANDLE_REFCNT_DEVICE_BITS) + +// +// +// + +static +void +skc_handle_reclaim_create(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool, + skc_handle_reclaim_type_e const reclaim_type, + skc_device_kernel_id const kernel_id) +{ + struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; + + // init counters + reclaim->bih.rem = 0; + + // acquire kernel + reclaim->kernel = skc_device_acquire_kernel(runtime->device,kernel_id); + reclaim->kernel_id = kernel_id; + + // set default args + cl(SetKernelArg(reclaim->kernel,0,SKC_CL_ARG(runtime->block_pool.ids.drw))); + cl(SetKernelArg(reclaim->kernel,1,SKC_CL_ARG(runtime->block_pool.blocks.drw))); + cl(SetKernelArg(reclaim->kernel,2,SKC_CL_ARG(runtime->block_pool.atomics.drw))); + cl(SetKernelArg(reclaim->kernel,3,SKC_CL_ARG(runtime->config->block_pool.ring_mask))); + cl(SetKernelArg(reclaim->kernel,4,SKC_CL_ARG(runtime->handle_pool.map.drw))); +} + +static +void +skc_handle_reclaim_dispose(struct skc_runtime * const runtime, + skc_handle_reclaim_type_e const reclaim_type) +{ + struct skc_handle_reclaim * const reclaim = runtime->handle_pool.reclaim + reclaim_type; + + cl(ReleaseKernel(reclaim->kernel)); +} + +// +// +// + +#define SKC_HANDLE_POOL_BLOCKS_PAD 8 + +void +skc_handle_pool_create(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool, + skc_uint const size, + skc_uint const width, + skc_uint const recs) +{ + skc_uint const blocks = (size + width - 1) / width; + skc_uint const blocks_padded = blocks + SKC_HANDLE_POOL_BLOCKS_PAD; + skc_uint const handles = blocks * width; + skc_uint const handles_padded = blocks_padded * width; + skc_uint const recs_padded = recs + 2; // one for pointer and one for head node + + skc_extent_pdrw_alloc(runtime,&handle_pool->map,handles * sizeof(skc_block_id_t)); + + handle_pool->handle.indices = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,handles_padded * sizeof(*handle_pool->handle.indices)); + handle_pool->handle.refcnts = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,handles * sizeof(*handle_pool->handle.refcnts)); + handle_pool->block.indices = 
skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,blocks_padded * sizeof(*handle_pool->block.indices)); + handle_pool->recs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,recs_padded * sizeof(*handle_pool->recs)); + + // initialize handles and refcnts + for (skc_uint ii=0; iihandle.indices[ii] = ii; + + for (skc_uint ii=0; iihandle.refcnts[ii].hd = 0; + + handle_pool->handle.count = handles; + + // initialize block accounting + for (skc_uint ii=0; iiblock.indices[ii] = ii; + + handle_pool->block.count = blocks_padded; + handle_pool->block.width = width; + + handle_pool->block.tos = blocks; // pop = pre-decrement / push = post-increment + handle_pool->block.bos = blocks; // pop = post-increment / push = pre-decrement + + // initialize recs -- first two elements are interpreted differently + handle_pool->recs[0].runtime = runtime; + handle_pool->recs[1] = (union skc_handle_reclaim_rec){ .rem = recs, .head = 2 }; + + for (skc_uint ii=2; iirecs[ii] = (union skc_handle_reclaim_rec){ .index = ii, .next = ii+1 }; + + handle_pool->recs[recs_padded-1].next = SKC_UINT_MAX; + + // initialize acquire + handle_pool->acquire.rem = 0; + + // create reclaimers + skc_handle_reclaim_create(runtime, + handle_pool, + SKC_HANDLE_RECLAIM_TYPE_PATH, + SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM); + + skc_handle_reclaim_create(runtime, + handle_pool, + SKC_HANDLE_RECLAIM_TYPE_RASTER, + SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM); +} + +// +// +// + +void +skc_handle_pool_dispose(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool) +{ + skc_handle_reclaim_dispose(runtime,SKC_HANDLE_RECLAIM_TYPE_RASTER); + skc_handle_reclaim_dispose(runtime,SKC_HANDLE_RECLAIM_TYPE_PATH); + + skc_runtime_host_perm_free(runtime,handle_pool->recs); + skc_runtime_host_perm_free(runtime,handle_pool->block.indices); + skc_runtime_host_perm_free(runtime,handle_pool->handle.refcnts); + skc_runtime_host_perm_free(runtime,handle_pool->handle.indices); + + skc_extent_pdrw_free(runtime,&handle_pool->map); +} + +// +// +// + +static +skc_uint +skc_handle_pool_block_readable_pop(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool) +{ + SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,handle_pool->block.tos == 0); + + skc_uint const index = handle_pool->block.indices[--handle_pool->block.tos]; + +#if 0 + skc_handle_t * handles = handle_pool->handle.indices + (index + 1) * handle_pool->block.width; + for (skc_uint ii=0; iiblock.width; ii++) + printf("R-: %u\n",*--handles); +#endif + + return index; +} + +static +void +skc_handle_pool_block_readable_push(struct skc_handle_pool * const handle_pool, + skc_uint const index) +{ + handle_pool->block.indices[handle_pool->block.tos++] = index; + +#if 0 + skc_handle_t * handles = handle_pool->handle.indices + (index + 1) * handle_pool->block.width; + for (skc_uint ii=0; iiblock.width; ii++) + printf("R+: %u\n",*--handles); +#endif +} + + +static +skc_uint +skc_handle_pool_block_writable_pop(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool) +{ + SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,handle_pool->block.bos == handle_pool->block.count); + + return handle_pool->block.indices[handle_pool->block.bos++]; +} + +static +void +skc_handle_pool_block_writable_push(struct skc_handle_pool * const handle_pool, + skc_uint const block_idx) +{ + handle_pool->block.indices[--handle_pool->block.bos] = block_idx; +} + +// +// May need to acquire the path or raster handle *early* just to be +// sure one exists +// + 
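//
// The block-index bookkeeping above is a double-ended stack: one array
// holds "readable" block indices below `tos` and "writable" block
// indices from `bos` upward, so both kinds can be pushed and popped
// without separate allocations. An illustrative reduction of just that
// idea (the real pool, in skc_runtime_handle_device_acquire below,
// adds scheduler waits and block-at-a-time handle hand-out on top):
//

struct example_dual_stack
{
  skc_uint * indices; // blocks_padded entries
  skc_uint   tos;     // readable: pop = indices[--tos], push = indices[tos++]
  skc_uint   bos;     // writable: pop = indices[bos++], push = indices[--bos]
};

static skc_uint example_readable_pop (struct example_dual_stack * const s)                    { return s->indices[--s->tos]; }
static void     example_readable_push(struct example_dual_stack * const s, skc_uint const ii) { s->indices[s->tos++] = ii;   }
static skc_uint example_writable_pop (struct example_dual_stack * const s)                    { return s->indices[s->bos++]; }
static void     example_writable_push(struct example_dual_stack * const s, skc_uint const ii) { s->indices[--s->bos] = ii;   }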
+skc_handle_t +skc_runtime_handle_device_acquire(struct skc_runtime * const runtime) +{ + struct skc_handle_pool * const handle_pool = &runtime->handle_pool; + + // acquire a block of handles at a time + if (handle_pool->acquire.rem == 0) + { + skc_uint const block_idx = skc_handle_pool_block_readable_pop(runtime,handle_pool); + + handle_pool->acquire.block = block_idx; + handle_pool->acquire.rem = handle_pool->block.width; + handle_pool->acquire.handles = handle_pool->handle.indices + (block_idx + 1) * handle_pool->block.width; + } + + // load handle from next block slot + skc_uint const rem = --handle_pool->acquire.rem; + skc_handle_t const handle = *--handle_pool->acquire.handles; + + // initialize refcnt for handle + handle_pool->handle.refcnts[handle] = (union skc_handle_refcnt){ .h = 1, .d = 1 }; + + // if this was the last handle in the block then move the block id + // to the reclamation stack to be used as a scratchpad + if (rem == 0) { + skc_handle_pool_block_writable_push(handle_pool,handle_pool->acquire.block); + } + + return handle; +} + +// +// +// + +static +void +skc_handle_reclaim_completion(union skc_handle_reclaim_rec * const recN) +{ + // get root rec which contains pointer to runtime + union skc_handle_reclaim_rec * const rec0 = recN - recN->index; + union skc_handle_reclaim_rec * const rec1 = rec0 + 1; + + // return block for reading + skc_handle_pool_block_readable_push(&rec0->runtime->handle_pool,recN->block); + + // recN is new head of list + recN->next = rec1->head; + rec1->head = recN->index; + rec1->rem += 1; +} + +static +void +skc_handle_reclaim_cb(cl_event event, cl_int status, union skc_handle_reclaim_rec * const recN) +{ + SKC_CL_CB(status); + + union skc_handle_reclaim_rec * const rec0 = recN - recN->index; + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(rec0->runtime->scheduler,skc_handle_reclaim_completion,recN); +} + +// +// FIXME -- is there an issue launching on the host thread? 
+//
+
+static
+void
+skc_handle_reclaim_launch(struct skc_runtime          * const runtime,
+                          struct skc_handle_pool      * const handle_pool,
+                          struct skc_handle_reclaim   * const reclaim,
+                          union skc_handle_reclaim_rec * const recN)
+{
+  cl(SetKernelArg(reclaim->kernel,
+                  5,
+                  handle_pool->block.width * sizeof(skc_handle_t),
+                  reclaim->bih.handles));
+
+  // acquire a cq
+  cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime);
+
+  cl_event complete;
+
+  // the kernel grid is shaped by the target device
+  skc_device_enqueue_kernel(runtime->device,
+                            reclaim->kernel_id,
+                            cq,
+                            reclaim->kernel,
+                            handle_pool->block.width,
+                            0,NULL,&complete);
+
+  cl(SetEventCallback(complete,CL_COMPLETE,skc_handle_reclaim_cb,recN));
+  cl(ReleaseEvent(complete));
+
+  // kickstart kernel execution
+  cl(Flush(cq));
+
+  // release the cq
+  skc_runtime_release_cq_in_order(runtime,cq);
+}
+
+//
+// reclaim a handle
+//
+
+static
+union skc_handle_reclaim_rec *
+skc_handle_acquire_reclaim_rec(struct skc_runtime     * const runtime,
+                               struct skc_handle_pool * const handle_pool)
+{
+  union skc_handle_reclaim_rec * const rec1 = handle_pool->recs + 1;
+
+  SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,rec1->rem == 0);
+
+  union skc_handle_reclaim_rec * const recN = handle_pool->recs + rec1->head;
+
+  rec1->head = recN->next;
+  rec1->rem -= 1;
+
+  // fprintf(stderr,"rec1->rem = %u\n",rec1->rem);
+
+  return recN;
+}
+
+static
+void
+skc_runtime_device_reclaim(struct skc_runtime        * const runtime,
+                           struct skc_handle_pool    * const handle_pool,
+                           struct skc_handle_reclaim * const reclaim,
+                           skc_handle_t                const handle)
+{
+  // grab a new block?
+  if (reclaim->bih.rem == 0)
+    {
+      skc_uint const block_idx = skc_handle_pool_block_writable_pop(runtime,handle_pool);
+
+      reclaim->bih.block   = block_idx;
+      reclaim->bih.rem     = handle_pool->block.width;
+      reclaim->bih.handles = handle_pool->handle.indices + (block_idx + 1) * handle_pool->block.width;
+    }
+
+  // store handle -- handle's refcnt was already set to {0:0}
+  *--reclaim->bih.handles = handle;
+
+  // if block is full then launch reclamation kernel
+  if (--reclaim->bih.rem == 0)
+    {
+      union skc_handle_reclaim_rec * recN = skc_handle_acquire_reclaim_rec(runtime,handle_pool);
+
+      recN->block = reclaim->bih.block;
+
+      skc_handle_reclaim_launch(runtime,handle_pool,reclaim,recN);
+    }
+}
+
+//
+// Validate host-provided handles before retaining.
+//
+// Retain validation consists of:
+//
+//   - correct handle type
+//   - handle is in range of pool
+//   - host refcnt is not zero
+//   - host refcnt is not at the maximum value
+//
+// After validation, retain the handles for the host
+//
+
+static
+skc_err
+skc_runtime_handle_host_validated_retain(struct skc_runtime       * const runtime,
+                                         skc_typed_handle_type_e    const handle_type,
+                                         skc_typed_handle_t const * const typed_handles,
+                                         uint32_t                   const count)
+{
+  //
+  // FIXME -- test to make sure handles aren't completely out of range integers
+  //
+
+  union skc_handle_refcnt * const refcnts = runtime->handle_pool.handle.refcnts;
+
+  for (skc_uint ii=0; ii<count; ii++)
+    {
+      skc_typed_handle_t const typed_handle = typed_handles[ii];
+
+      if (!SKC_TYPED_HANDLE_IS_TYPE(typed_handle,handle_type))
+        {
+          return SKC_ERR_HANDLE_INVALID;
+        }
+      else
+        {
+          skc_handle_t const handle = SKC_TYPED_HANDLE_TO_HANDLE(typed_handle);
+
+          if (handle >= runtime->handle_pool.handle.count)
+            {
+              return SKC_ERR_HANDLE_INVALID;
+            }
+          else
+            {
+              union skc_handle_refcnt * const refcnt_ptr = refcnts + handle;
+              skc_uint                  const host       = refcnt_ptr->h;
+
+              if (host == 0)
+                {
+                  return SKC_ERR_HANDLE_INVALID;
+                }
+              else if (host == SKC_HANDLE_REFCNT_HOST_MAX)
+                {
+                  return SKC_ERR_HANDLE_OVERFLOW;
+                }
+            }
+        }
+    }
+
+  //
+  // all the handles validated, so retain them all..
+ // + for (skc_uint ii=0; iideps,rasters,count); + + return SKC_ERR_SUCCESS; +} + +skc_err +skc_runtime_path_host_flush(struct skc_runtime * const runtime, + skc_path_t const * paths, + uint32_t count) +{ + skc_grid_deps_force(runtime->deps,paths,count); + + return SKC_ERR_SUCCESS; +} + +// +// Validate host-provided handles before releasing. +// +// Release validation consists of: +// +// - correct handle type +// - handle is in range of pool +// - host refcnt is not zero +// +// After validation, release the handles for the host +// + +static +skc_err +skc_runtime_host_validated_release(struct skc_runtime * const runtime, + skc_typed_handle_type_e const type, + skc_handle_reclaim_type_e const reclaim_type, + skc_typed_handle_t const * const handles, + uint32_t const count) +{ + struct skc_handle_pool * const handle_pool = &runtime->handle_pool; + union skc_handle_refcnt * const refcnts = handle_pool->handle.refcnts; + + for (skc_uint ii=0; ii= handle_pool->handle.count) + { + return SKC_ERR_HANDLE_INVALID; + } + else + { + union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; + skc_uint const host = refcnt_ptr->h; + + if (host == 0) + { + return SKC_ERR_HANDLE_INVALID; + } + } + } + } + + // + // all the handles validated, so release them all.. + // + struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; + + for (skc_uint ii=0; iihandle_pool.handle.refcnts; + + while (count-- > 0) + { + skc_typed_handle_t const typed_handle = *handles++; + + if (!SKC_TYPED_HANDLE_IS_TYPE(typed_handle,type)) + { + return SKC_ERR_HANDLE_INVALID; + } + else + { + skc_handle_t const handle = SKC_TYPED_HANDLE_TO_HANDLE(typed_handle); + + if (handle >= runtime->handle_pool.handle.count) + { + return SKC_ERR_HANDLE_INVALID; + } + else + { + union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; + union skc_handle_refcnt refcnt = *refcnt_ptr; + + if (refcnt.h == 0) + { + return SKC_ERR_HANDLE_INVALID; + } + else if (refcnt.d == SKC_HANDLE_REFCNT_DEVICE_MAX) + { + return SKC_ERR_HANDLE_OVERFLOW; + } + } + } + } + + return SKC_ERR_SUCCESS; +} + +// +// After validation, retain the handles for the device +// + +void +skc_runtime_handle_device_retain(struct skc_runtime * const runtime, + skc_handle_t const * handles, + uint32_t count) +{ + union skc_handle_refcnt * const refcnts = runtime->handle_pool.handle.refcnts; + + while (count-- > 0) + refcnts[SKC_TYPED_HANDLE_TO_HANDLE(*handles++)].d++; +} + +// +// Release the device-held handles -- no validation required! 
+// + +static +void +skc_runtime_handle_device_release(struct skc_runtime * const runtime, + skc_handle_reclaim_type_e const reclaim_type, + skc_handle_t const * handles, + skc_uint count) +{ + struct skc_handle_pool * const handle_pool = &runtime->handle_pool; + union skc_handle_refcnt * const refcnts = handle_pool->handle.refcnts; + struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; + + while (count-- > 0) { + skc_handle_t const handle = *handles++; + union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; + union skc_handle_refcnt refcnt = *refcnt_ptr; + + refcnt.d -= 1; + *refcnt_ptr = refcnt; + +#if 0 + printf("%8u = { %u, %u }\n",handle,refcnt.h,refcnt.d); +#endif + + if (refcnt.hd == 0) { + skc_runtime_device_reclaim(runtime,handle_pool,reclaim,handle); + } + } +} + +// +// +// + +void +skc_runtime_path_device_release(struct skc_runtime * const runtime, + skc_handle_t const * handles, + skc_uint count) +{ + skc_runtime_handle_device_release(runtime,SKC_HANDLE_RECLAIM_TYPE_PATH,handles,count); +} + +void +skc_runtime_raster_device_release(struct skc_runtime * const runtime, + skc_handle_t const * handles, + skc_uint count) +{ + skc_runtime_handle_device_release(runtime,SKC_HANDLE_RECLAIM_TYPE_RASTER,handles,count); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/handle_pool_cl_12.h b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.h new file mode 100644 index 0000000000..4fefae3552 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.h @@ -0,0 +1,177 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "macros.h" +#include "handle.h" +#include "extent_cl_12.h" +#include "device_cl_12.h" + +// +// FIXME -- THIS DOCUMENTATION IS STALE NOW THAT A REFERENCE COUNT REP +// IS A {HOST:DEVICE} PAIR. +// +// Host-side handle pool +// +// The bulk size of the three extents is currently 6 bytes of overhead +// per number of host handles. The number of host handles is usually +// less than the number of blocks in the pool. Note that the maximum +// number of blocks is 2^27. +// +// A practical instantiation might provide a combined 2^20 path and +// raster host handles. This would occupy 6 MB of host RAM for the +// 32-bit handle, 8-bit reference count and 8-bit handle-to-grid map. +// +// Also note that we could use isolated/separate path and raster block +// pools. Worst case, this would double the memory footprint of SKC. +// +// Host-side handle reference count +// +// [0 ] : release +// [1..UMAX] : retain +// +// In a garbage-collected environment we might want to rely on an +// existing mechanism for determing whether a handle is live. +// +// Otherwise, we probably want to have a 16 or 32-bit ref count. +// +// The handle reference count is defensive and will not allow the host +// to underflow a handle that's still retained by the pipeline. +// +// The single reference counter is split into host and device counts. 
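+//
+// For example (illustrative values only): a freshly acquired handle
+// starts at { .h = 1, .d = 1 }; a host release drops it to
+// { .h = 0, .d = 1 }; and only once the combined .hd field reaches
+// zero is the handle queued for device-side reclamation.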
+// + +union skc_handle_refcnt +{ + skc_ushort hd; // host and device + + struct { + skc_uchar h; // host + skc_uchar d; // device + }; +}; + +SKC_STATIC_ASSERT(SKC_MEMBER_SIZE(union skc_handle_refcnt,hd) == + SKC_MEMBER_SIZE(union skc_handle_refcnt,h) + + SKC_MEMBER_SIZE(union skc_handle_refcnt,d)); + +// +// +// + +struct skc_handle_bih +{ + skc_uint block; + skc_uint rem; + skc_handle_t * handles; +}; + +struct skc_handle_reclaim +{ + struct skc_handle_bih bih; + + cl_kernel kernel; + skc_device_kernel_id kernel_id; +}; + +union skc_handle_reclaim_rec +{ + // ELEMENT 0 + struct skc_runtime * runtime; + + // ELEMENT 1 + struct { + skc_uint rem; // # of available records + skc_uint head; // index of first record + }; + + // ELEMENTS 2+ + struct { + skc_uint index; // index of this record -- never modified + union { + skc_uint next; // index of next record + skc_uint block; // block index of reclaimed handles + }; + }; +}; + +SKC_STATIC_ASSERT(sizeof(union skc_handle_reclaim_rec) == sizeof(skc_uint2)); + +// +// +// + +typedef enum skc_handle_reclaim_type_e { + + SKC_HANDLE_RECLAIM_TYPE_PATH, + SKC_HANDLE_RECLAIM_TYPE_RASTER, + + SKC_HANDLE_RECLAIM_TYPE_COUNT + +} skc_handle_reclaim_type_e; + +struct skc_handle_pool +{ + // + // FIXME -- should we be pedantic and make these always-host-side + // allocations "extents" as well? I think it's OK not being an + // extent structure for now and is mostly consistent with the rest + // of the code. + // + // FIXME -- the cbs[] array is a little idiosyncratic but the intent + // is to avoid storing the 64-bit backpointer inside of every single + // record. This can be harmonized later. Note that only a few + // hundred outstanding callbacks would represent many many subgroups + // of work and would fully occupy the GPU (if we allow it). + // + // + struct skc_extent_pdrw map; // device-managed extent mapping a host handle to device block id + + struct { + skc_handle_t * indices; // array of individual host handles -- fragmented into blocks + union skc_handle_refcnt * refcnts; // array of reference counts indexed by an individual handle + skc_uint count; + } handle; + + struct { + skc_uint * indices; // stack of indices to fixed-size blocks of host handles + skc_uint count; // number of handles -- valid from [0,size) + skc_uint width; // width of a fixed-size block of handles + skc_uint tos; // grows upward / push++ / --pop / # fixed-size blocks for reading + skc_uint bos; // grows downward / --push / pop++ / # fixed-size blocks for writing + } block; + + union skc_handle_reclaim_rec * recs; // array of reclaim records + + struct skc_handle_bih acquire; + struct skc_handle_reclaim reclaim[SKC_HANDLE_RECLAIM_TYPE_COUNT]; +}; + +// +// +// + +void +skc_handle_pool_create(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool, + skc_uint const size, + skc_uint const width, + skc_uint const recs); + +void +skc_handle_pool_dispose(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl new file mode 100644 index 0000000000..726b0a7907 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl @@ -0,0 +1,64 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include "device_cl_12.h" + +// +// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ +// + +__kernel +SKC_BP_INIT_IDS_KERNEL_ATTRIBS +void +skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size) +{ + uint const gid = get_global_id(0); + + // + // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to + // accomplish this with fewer threads and using either IPC and/or + // vector stores -- it should be on certain architectures! + // + + // + // initialize pool with sequence + // + if (gid < bp_size) + ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK; +} + +// +// +// + +__kernel +SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS +void +skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size) +{ + // the version test is to squelch a bug with the Intel OpenCL CPU + // compiler declaring it supports the cl_intel_subgroups extension +#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups) + uint const tid = get_sub_group_local_id(); +#else + uint const tid = get_local_id(0); +#endif + + // + // launch two threads and store [ 0, bp_size ] + // + bp_atomics[tid] = tid * bp_size; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h b/src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h new file mode 100644 index 0000000000..e68579c0f7 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h @@ -0,0 +1,60 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_ONCE_DEVICE_CL_12_AVX2_H +#define SKC_ONCE_DEVICE_CL_12_AVX2_H + +// +// +// + +#define SKC_DEVICE_BLOCK_WORDS_LOG2 6 +#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 4 + +// +// +// + +#define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) +#define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +// +// +// + +#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) + +// +// +// + +#define SKC_COPY_PATHS_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS +#define SKC_COPY_PATHS_ELEM_WORDS 1 + +// +// +// + +#define SKC_EXPAND_FILLS_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS +#define SKC_EXPAND_FILLS_ELEM_WORDS 1 + +// +// +// + +#define SKC_RASTERIZE_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c new file mode 100644 index 0000000000..aebe8fdc1d --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c @@ -0,0 +1,938 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#include +#include +#include + +#include "common/cl/assert_cl.h" + +#include "tile.h" +#include "raster.h" +#include "macros.h" + +#include "config_cl.h" +#include "runtime_cl_12.h" + +#include "device_cl_12.h" + +#include "hs/cl/hs_cl_launcher.h" +#include "hs/cl/gen9/hs_cl.h" + +// +// +// + +#define SKC_KERNEL_SPIRV 0 +#define SKC_KERNEL_BINARY 1 +#define SKC_KERNEL_SRC 0 + +// +// +// + +#if SKC_KERNEL_SPIRV + +#include "inl/block_pool_init.pre.spv.inl" +#include "inl/paths_copy.pre.spv.inl" +#include "inl/fills_expand.pre.spv.inl" +#include "inl/rasterize.pre.spv.inl" +#include "inl/segment_ttrk.pre.spv.inl" +#include "inl/rasters_alloc.pre.spv.inl" +#include "inl/prefix.pre.spv.inl" +#include "inl/place.pre.spv.inl" +#include "inl/segment_ttck.pre.spv.inl" +#include "inl/render.pre.spv.inl" +#include "inl/paths_reclaim.pre.spv.inl" +#include "inl/rasters_reclaim.pre.spv.inl" + +#elif SKC_KERNEL_BINARY + +#include "inl/block_pool_init.pre.bin.inl" +#include "inl/paths_copy.pre.bin.inl" +#include "inl/fills_expand.pre.bin.inl" +#include "inl/rasterize.pre.bin.inl" +#include "inl/segment_ttrk.pre.bin.inl" +#include "inl/rasters_alloc.pre.bin.inl" +#include "inl/prefix.pre.bin.inl" +#include "inl/place.pre.bin.inl" +#include "inl/segment_ttck.pre.bin.inl" +#include "inl/render.pre.bin.inl" +#include "inl/paths_reclaim.pre.bin.inl" +#include "inl/rasters_reclaim.pre.bin.inl" + +#elif SKC_KERNEL_SRC + +#include "inl/block_pool_init.pre.src.inl" +#include "inl/paths_copy.pre.src.inl" +#include "inl/fills_expand.pre.src.inl" +#include "inl/rasterize.pre.src.inl" +#include "inl/segment_ttrk.pre.src.inl" +#include "inl/rasters_alloc.pre.src.inl" +#include "inl/prefix.pre.src.inl" +#include "inl/place.pre.src.inl" +#include "inl/segment_ttck.pre.src.inl" +#include "inl/render.pre.src.inl" +#include "inl/paths_reclaim.pre.src.inl" +#include "inl/rasters_reclaim.pre.src.inl" + +#endif + +// +// FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY +// + +static +struct skc_config const config = + { + .suballocator = { + .host = { + .size = 1024 * 1024, // words + .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) + }, + .device = { + .size = 128 * 1024 * 1024, + .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) + } + }, + + .scheduler = { + .size = 4096 // 128 // fixme -- this is just for testing -- too big + }, + + .subblock = { + .words = SKC_DEVICE_SUBBLOCK_WORDS, // words per subblock -- pow2 + .bytes = SKC_DEVICE_SUBBLOCK_WORDS * sizeof(skc_uint) // bytes per subblock -- pow2 + }, + + .block = { + .words = SKC_DEVICE_BLOCK_WORDS, // words per block -- pow2 + .bytes = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint), // bytes per block -- pow2 + .subblocks = SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS // subblocks per block -- block.bytes >= subblock.bytes + }, + + .block_pool = { + .pool_size = 524288, // blocks in pool -- 128 MB + .ring_pow2 = 524288, // blocks in pool rounded up pow2 + .ring_mask = 524288 - 1 + }, + + .cq_pool = { +#ifndef NDEBUG + .type = SKC_CQ_TYPE_IN_ORDER_PROFILING, +#else + .type = 0, +#endif + .size = 8 + }, + + .handle_pool = { + .size = 262144, // large fraction of block pool size (for now, 1:2) + .width = SKC_RECLAIM_ARRAY_SIZE, + .recs = 256 // too many? too few? 
+ }, + + .tile = { + .width = SKC_TILE_WIDTH, // tile width in pixels + .height = SKC_TILE_HEIGHT, // tile height in pixels + .ratio = SKC_TILE_HEIGHT / SKC_TILE_WIDTH // subblocks per TTPB + }, + + .paths_copy = { + + .buffer = { + .count = 16 // # of subbufs in buffer + }, + + .subbuf = { + .count = 1024 // # of blocks/commands in subbuf + }, + + .block = { + .subbuf = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024, // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN + .buffer = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024 * 16 // block.bytes * subbuf.blocks * subbuf.count + }, + + .command = { + .subbuf = sizeof(skc_uint) * 1024, // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN + .buffer = sizeof(skc_uint) * 1024 * 16 // sizeof(skc_uint) * subbuf.blocks * subbuf.count + }, + + // skc_uint paths_lowat; + }, + + .raster_cohort = { + .path_ids = { + .elem_count = 8192, + .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER + }, + + .transforms = { + .elem_count = 8192, + .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER + }, + + .clips = { + .elem_count = 8192, + .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER + }, + + .fill = { + .elem_count = 8192, + .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER + }, + + .raster_ids = { + .elem_count = 8192, + .snap_count = (1<device->kernels[id] = clCreateKernel(program,name,&cl_err); cl_ok(cl_err); + + // + // release program now + // + // FIXME -- if/when we multithread then we need to clone kernels + // (>=2.1) or keep programs around (<=2.0) + // + + // get workgroup size + cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], + runtime->cl.device_id, + CL_KERNEL_COMPILE_WORK_GROUP_SIZE, + sizeof(runtime->device->reqd_szs[0]), + runtime->device->reqd_szs[id], + NULL)); + + // + // GEN9+ PROBING + // +#define SKC_TARGET_GEN9 +#ifdef SKC_TARGET_GEN9 + +#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 +#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 +#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A + + cl_ulong spill_mem_size; + + cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], + runtime->cl.device_id, + CL_KERNEL_SPILL_MEM_SIZE_INTEL, + sizeof(spill_mem_size), + &spill_mem_size, + NULL)); + + fprintf(stderr,"\t\tspill mem size: %lu bytes\n", + (unsigned long)spill_mem_size); + + cl_ulong local_mem_size; + + cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], + runtime->cl.device_id, + CL_KERNEL_LOCAL_MEM_SIZE, + sizeof(local_mem_size), + &local_mem_size, + NULL)); + + fprintf(stderr,"\t\tlocal mem size: %lu bytes\n", + (unsigned long)local_mem_size); +#endif + } +} + +static +void +skc_device_build_program(struct skc_runtime * const runtime, + struct skc_program_source const * const source, + struct skc_program_kernel const * const kernels, + skc_uint const kernel_count) +{ + cl_program program; + + fprintf(stderr,"%-20s: ",source->name); + + cl_int cl_err; + +#if SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V + + fprintf(stderr,"Creating (SPIR-V) ... "); + + program = clCreateProgramWithIL(runtime->cl.context, + source->src, + source->srclen, + &cl_err); + +#elif SKC_KERNEL_BINARY // PROGRAM IS BINARY + + fprintf(stderr,"Creating (Binary) ... "); + + cl_int status; + program = clCreateProgramWithBinary(runtime->cl.context, + 1, + &runtime->cl.device_id, + &source->srclen, + (unsigned char const *[]){ source->src }, + &status, + &cl_err); + +#elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE + + fprintf(stderr,"Creating (Source) ... 
"); + + program = clCreateProgramWithSource(runtime->cl.context, + 1, + (char const *[]){ source->src }, + &source->srclen, + &cl_err); +#else + +#error "SKC_KERNEL_???" + +#endif + + cl_ok(cl_err); + + fprintf(stderr,"Building ... "); + + // build the program + cl(BuildProgram(program, + 1, + &runtime->cl.device_id, + source->options, // build options are ignored by binary + NULL, + NULL)); + + fprintf(stderr,"Done\n"); + + // build the kernels + skc_device_create_kernels(runtime,kernels,kernel_count,program); + + // we're done with program for now + // can always recover it from a kernel instance + cl(ReleaseProgram(program)); +} + +// +// RELEASE KERNELS +// + +static +void +skc_device_release_kernels(struct skc_device * const device) +{ + for (skc_int ii=0; iikernels); ii++) + cl(ReleaseKernel(device->kernels[ii])); +} + + + +cl_kernel +skc_device_acquire_kernel(struct skc_device * const device, + skc_device_kernel_id const type) +{ + cl_kernel kernel = device->kernels[type]; + + cl(RetainKernel(kernel)); + + return kernel; +} + +// +// INITIALIZE KERNEL ARGS +// +// FIXME +// +// pre-assign any kernel arguments that are never going to change -- +// for example, the block pool +// + +// +// +// + +#define SKC_DEVICE_BUILD_PROGRAM(p) \ + skc_device_build_program(runtime,&program_sources.p,program_kernels.p,SKC_COUNT_OF(program_kernels.p)) + + +void +skc_device_create(struct skc_runtime * const runtime) +{ + struct skc_device * const device = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*device)); + + // hang device off of runtime + runtime->device = device; + + // hang config off of runtime + runtime->config = &config; + + // create kernels + SKC_DEVICE_BUILD_PROGRAM(block_pool_init); + SKC_DEVICE_BUILD_PROGRAM(paths_copy); + SKC_DEVICE_BUILD_PROGRAM(fills_expand); + SKC_DEVICE_BUILD_PROGRAM(rasterize); + SKC_DEVICE_BUILD_PROGRAM(segment_ttrk); + SKC_DEVICE_BUILD_PROGRAM(rasters_alloc); + SKC_DEVICE_BUILD_PROGRAM(prefix); + SKC_DEVICE_BUILD_PROGRAM(place); + SKC_DEVICE_BUILD_PROGRAM(segment_ttck); + SKC_DEVICE_BUILD_PROGRAM(render); + SKC_DEVICE_BUILD_PROGRAM(paths_reclaim); + SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim); + + // create HotSort instance -- FIXME -- how this occurs needs to be cleaned up + hs_create(runtime->cl.context,runtime->cl.device_id,NULL); +} + +void +skc_device_dispose(struct skc_runtime * const runtime) +{ + // + // FIXME -- dispose of programs, kernels, etc. 
+ // + + skc_runtime_host_perm_free(runtime,runtime->device); +} + +// +// FIXME -- just pass the device type +// + +void +skc_device_enqueue_kernel(struct skc_device * const device, + skc_device_kernel_id const type, + cl_command_queue cq, + cl_kernel kernel, + size_t const work_size, + cl_uint num_events_in_wait_list, + cl_event const * const event_wait_list, + cl_event * const event) +{ + if (work_size == 0) + return; + + cl_uint work_dim [1]; + size_t work_global[3]; + size_t work_local [3]; + + size_t * work_local_ptr = program_kernels.kernels[type].shaper(work_size, + work_dim, + work_global, + work_local); + cl(EnqueueNDRangeKernel(cq, + kernel,// device->kernels[type], + work_dim[0], + NULL, + work_global, + work_local_ptr, + num_events_in_wait_list, + event_wait_list, + event)); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h new file mode 100644 index 0000000000..0cac2261e7 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h @@ -0,0 +1,341 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_ONCE_DEVICE_CL_12_H +#define SKC_ONCE_DEVICE_CL_12_H + +// +// FIXME -- THERE ARE SOME DUPLICATED TYPEDEFS IN THIS FILE +// +// THESE WILL GO AWAY AS THE TYPING GET POLISHED AND SIMPLIFIED +// + +#include "block.h" + +// +// +// + +#include + +// +// HOW TO SELECT A SUBBLOCK AND BLOCK SIZES: +// +// 1) The subblock size should match the natural SIMT/SIMD width of +// the target device. +// +// 2) Either a square or rectangular (1:2) tile size is chosen. The +// tile size is usually determined by the amount of SMEM available +// to a render kernel subgroup and desired multiprocessor +// occupancy. +// +// 3) If the tile is rectangular then the block size must be at least +// twice the size of the subblock size. +// +// 4) A large block size can decrease allocation overhead but there +// will be diminishing returns as the block size increases. +// + +#define SKC_DEVICE_BLOCK_WORDS_LOG2 6 // CHANGE "WORDS" TO "SIZE" ? +#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 3 + +#define SKC_TILE_WIDTH_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 +#define SKC_TILE_HEIGHT_LOG2 (SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + 1) + +///////////////////////////////////////////////////////////////// +// +// BLOCK POOL INIT +// + +#define SKC_BP_INIT_IDS_KERNEL_ATTRIBS +#define SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(2,1,1))) + +///////////////////////////////////////////////////////////////// +// +// PATHS ALLOC +// + +#define SKC_PATHS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(1,1,1))) + +///////////////////////////////////////////////////////////////// +// +// PATHS COPY +// + +#define SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? 
+#define SKC_PATHS_COPY_ELEM_WORDS 1 +#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_1() + +#define SKC_PATHS_COPY_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_COPY_SUBGROUP_SIZE))) + +#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + get_sub_group_local_id() >= SKC_PATH_HEAD_WORDS) + +typedef skc_uint skc_paths_copy_elem; +typedef skc_uint skc_pb_idx_v; + +///////////////////////////////////////////////////////////////// +// +// FILLS EXPAND +// + +#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 +#define SKC_FILLS_EXPAND_ELEM_WORDS 1 + +#define SKC_FILLS_EXPAND_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_FILLS_EXPAND_SUBGROUP_SIZE))) + +///////////////////////////////////////////////////////////////// +// +// RASTER ALLOC +// +// NOTE -- Intel subgroup shuffles aren't supported in SIMD32 which is +// why use of the subgroup broadcast produces a compiler error. So a +// subgroup of size 16 is this widest we can require. +// + +#define SKC_RASTERS_ALLOC_GROUP_SIZE 16 + +#if (SKC_RASTERS_ALLOC_GROUP_SIZE <= 16) + +#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE))) +#define SKC_RASTERS_ALLOC_LOCAL_ID() get_sub_group_local_id() +#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) sub_group_scan_inclusive_add(v) +#define SKC_RASTERS_ALLOC_BROADCAST(v,i) sub_group_broadcast(v,i) + +#else + +#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE,1,1))) +#define SKC_RASTERS_ALLOC_LOCAL_ID() get_local_id(0) +#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) work_group_scan_inclusive_add(v) +#define SKC_RASTERS_ALLOC_BROADCAST(v,i) work_group_broadcast(v,i) + +#endif + +///////////////////////////////////////////////////////////////// +// +// RASTERIZE +// + +#define SKC_RASTERIZE_SUBGROUP_SIZE SKC_DEVICE_SUBBLOCK_WORDS +#define SKC_RASTERIZE_VECTOR_SIZE_LOG2 0 +#define SKC_RASTERIZE_WORKGROUP_SUBGROUPS 1 + +#define SKC_RASTERIZE_KERNEL_ATTRIBS \ + __attribute__((intel_reqd_sub_group_size(SKC_RASTERIZE_SUBGROUP_SIZE))) \ + __attribute__((reqd_work_group_size(SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_WORKGROUP_SUBGROUPS, 1, 1))) + +#define SKC_RASTERIZE_FLOAT float +#define SKC_RASTERIZE_UINT uint +#define SKC_RASTERIZE_INT int +#define SKC_RASTERIZE_PREDICATE bool +#define SKC_RASTERIZE_POOL uint + +#define SKC_RASTERIZE_TILE_HASH_X_BITS 1 +#define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 + +typedef skc_block_id_t skc_block_id_v_t; +typedef skc_uint2 skc_ttsk_v_t; +typedef skc_uint2 skc_ttsk_s_t; + +// SKC_STATIC_ASSERT(SKC_RASTERIZE_POOL_SIZE > SKC_RASTERIZE_SUBGROUP_SIZE); + +///////////////////////////////////////////////////////////////// +// +// PREFIX +// + +#define SKC_PREFIX_SUBGROUP_SIZE 8 // for now this had better be SKC_DEVICE_SUBBLOCK_WORDS +#define SKC_PREFIX_WORKGROUP_SUBGROUPS 1 + +#define SKC_PREFIX_KERNEL_ATTRIBS \ + __attribute__((intel_reqd_sub_group_size(SKC_PREFIX_SUBGROUP_SIZE))) \ + __attribute__((reqd_work_group_size(SKC_PREFIX_SUBGROUP_SIZE * SKC_PREFIX_WORKGROUP_SUBGROUPS, 1, 1))) + +#define SKC_PREFIX_TTP_V skc_uint2 +#define SKC_PREFIX_TTS_V_BITFIELD skc_int + +#define SKC_PREFIX_TTS_VECTOR_INT_EXPAND SKC_EXPAND_1 + +#define SKC_PREFIX_SMEM_ZERO ulong +#define SKC_PREFIX_SMEM_ZERO_WIDTH (sizeof(SKC_PREFIX_SMEM_ZERO) / sizeof(skc_ttp_t)) +#define SKC_PREFIX_SMEM_COUNT_BLOCK_ID 8 + +#define SKC_PREFIX_BLOCK_ID_V_SIZE SKC_PREFIX_SUBGROUP_SIZE + +#define SKC_PREFIX_TTXK_V_SIZE SKC_PREFIX_SUBGROUP_SIZE +#define 
SKC_PREFIX_TTXK_V_MASK (SKC_PREFIX_TTXK_V_SIZE - 1) + +typedef skc_uint skc_bp_elem_t; + +typedef skc_uint2 skc_ttrk_e_t; +typedef skc_uint2 skc_ttsk_v_t; +typedef skc_uint2 skc_ttsk_s_t; +typedef skc_uint2 skc_ttpk_s_t; +typedef skc_uint2 skc_ttxk_v_t; + +typedef skc_int skc_tts_v_t; + +typedef skc_int skc_ttp_t; + +typedef skc_uint skc_raster_yx_s; + +typedef skc_block_id_t skc_block_id_v_t; +typedef skc_block_id_t skc_block_id_s_t; + +///////////////////////////////////////////////////////////////// +// +// PLACE +// + +#define SKC_PLACE_SUBGROUP_SIZE 16 +#define SKC_PLACE_WORKGROUP_SUBGROUPS 1 + +#define SKC_PLACE_KERNEL_ATTRIBS \ + __attribute__((intel_reqd_sub_group_size(SKC_PLACE_SUBGROUP_SIZE))) \ + __attribute__((reqd_work_group_size(SKC_PLACE_SUBGROUP_SIZE * SKC_PLACE_WORKGROUP_SUBGROUPS, 1, 1))) + +typedef skc_uint skc_bp_elem_t; + +typedef skc_uint skc_ttsk_lo_t; +typedef skc_uint skc_ttsk_hi_t; + +typedef skc_uint skc_ttpk_lo_t; +typedef skc_uint skc_ttpk_hi_t; + +typedef skc_uint skc_ttxk_lo_t; +typedef skc_uint skc_ttxk_hi_t; + +typedef skc_uint2 skc_ttck_t; + +typedef skc_bool skc_pred_v_t; +typedef skc_int skc_int_v_t; + +///////////////////////////////////////////////////////////////// +// +// RENDER +// + +#define SKC_ARCH_GEN9 + +#if defined(__OPENCL_C_VERSION__) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +#define SKC_RENDER_SUBGROUP_SIZE 8 +#define SKC_RENDER_WORKGROUP_SUBGROUPS 1 + +#define SKC_RENDER_KERNEL_ATTRIBS \ + __attribute__((intel_reqd_sub_group_size(SKC_RENDER_SUBGROUP_SIZE))) \ + __attribute__((reqd_work_group_size(SKC_RENDER_SUBGROUP_SIZE * SKC_RENDER_WORKGROUP_SUBGROUPS, 1, 1))) + +#define SKC_RENDER_SCANLINE_VECTOR_SIZE 2 + +#define SKC_RENDER_REGS_COLOR_R 2 +#define SKC_RENDER_REGS_COVER_R 3 + +#define SKC_RENDER_TTSB_EXPAND() SKC_EXPAND_1() + +#define SKC_RENDER_TTS_V skc_int +#define SKC_RENDER_TTS_V_BITFIELD skc_int + +#define SKC_RENDER_TTP_V skc_int2 +#define SKC_RENDER_AREA_V skc_int2 + +#define SKC_RENDER_TILE_COLOR_PAIR half2 +#define SKC_RENDER_TILE_COLOR_PAIR_LOAD(x,v) vload2(x,v) + +#define SKC_RENDER_SURFACE_COLOR half4 +#define SKC_RENDER_SURFACE_WRITE write_imageh + +// #define SKC_RENDER_TTXB_VECTOR_INT int2 +// #define SKC_RENDER_TTXB_VECTOR_UINT uint2 + +#define SKC_RENDER_WIDE_AA ulong // SLM = 64 bytes/clock + +#define SKC_RENDER_TILE_COLOR half2 +#define SKC_RENDER_TILE_COVER half2 + +#define SKC_RENDER_ACC_COVER_INT int2 +#define SKC_RENDER_ACC_COVER_UINT uint2 + +#define SKC_RENDER_GRADIENT_FLOAT float2 +#define SKC_RENDER_GRADIENT_INT int2 +#define SKC_RENDER_GRADIENT_STOP int2 +#define SKC_RENDER_GRADIENT_FRAC half2 +#define SKC_RENDER_GRADIENT_COLOR_STOP half + +#define SKC_RENDER_SURFACE_U8_RGBA uint2 + +#define SKC_RENDER_TILE_COLOR_VECTOR uint16 +#define SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT uint +#define SKC_RENDER_TILE_COLOR_VECTOR_COUNT ((sizeof(SKC_RENDER_TILE_COLOR) * 4 * SKC_TILE_WIDTH) / sizeof(SKC_RENDER_TILE_COLOR_VECTOR)) + +///////////////////////////////////////////////////////////////// +// +// PATHS & RASTERS RECLAIM +// +// FIXME -- investigate enabling the stride option for a smaller grid +// that iterates over a fixed number of threads. Since reclamation is +// a low-priority task, it's probably reasonable to trade longer +// reclamation times for lower occupancy of the device because it +// might delay the fastpath of the pipeline. 
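+//
+// Spelled out for this target: SKC_RECLAIM_ARRAY_SIZE = 7 x 8 / 2 = 28,
+// i.e. 28 handles are batched per reclamation launch (the runtime's
+// handle_pool.width is configured with this same constant).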
+// + +#define SKC_RECLAIM_ARRAY_SIZE (7 * 8 / 2) // 8 EUs with 7 hardware threads divided by 2 is half a sub-slice + +///////////////////////////////////////////////////////////////// +// +// PATHS RECLAIM +// + +#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? +#define SKC_PATHS_RECLAIM_LOCAL_ELEMS 1 +#define SKC_PATHS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_RECLAIM_SUBGROUP_SIZE))) + +///////////////////////////////////////////////////////////////// +// +// RASTERS RECLAIM +// + +#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? +#define SKC_RASTERS_RECLAIM_LOCAL_ELEMS 1 +#define SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_RECLAIM_SUBGROUP_SIZE))) + +// +// COMMON -- FIXME -- HOIST THESE ELSEWHERE +// + +#define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) +#define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +#define SKC_DEVICE_BLOCK_DWORDS (SKC_DEVICE_BLOCK_WORDS / 2) + +#define SKC_DEVICE_BLOCK_WORDS_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2) +#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2 - SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) + +#define SKC_TILE_RATIO (SKC_TILE_HEIGHT / SKC_TILE_WIDTH) + +// +// +// + +#define SKC_PATHS_COPY_SUBGROUP_SIZE (1 << SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2) +#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE (1 << SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2) +#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE (1 << SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2) +#define SKC_FILLS_EXPAND_SUBGROUP_SIZE (1 << SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2) + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat new file mode 100644 index 0000000000..3631271d9b --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat @@ -0,0 +1,15 @@ +@ECHO OFF + +CMD /C make_inl_cl.bat ..\..\..\block_pool_init.cl +CMD /C make_inl_cl.bat ..\..\..\fills_expand.cl +CMD /C make_inl_cl.bat ..\..\..\paths_copy.cl +CMD /C make_inl_cl.bat ..\..\..\rasterize.cl +CMD /C make_inl_cl.bat ..\..\..\segment_ttrk.cl +CMD /C make_inl_cl.bat ..\..\..\rasters_alloc.cl +CMD /C make_inl_cl.bat ..\..\..\prefix.cl +CMD /C make_inl_cl.bat ..\..\..\place.cl +CMD /C make_inl_cl.bat ..\..\..\segment_ttck.cl +CMD /C make_inl_cl.bat ..\..\..\render.cl +CMD /C make_inl_cl.bat ..\..\..\paths_reclaim.cl +CMD /C make_inl_cl.bat ..\..\..\rasters_reclaim.cl + diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat new file mode 100644 index 0000000000..e3b0b37651 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat @@ -0,0 +1,85 @@ +@ECHO OFF + +:: +:: TARGET OPENCL 1.2 +:: + +SET OPENCL_STD=-cl-std=CL1.2 +SET OPENCL_PRE=__OPENCL_C_VERSION__=120 + +:: OPENCL_STD=-cl-std=CL2.0 +:: OPENCL_PRE=__OPENCL_C_VERSION__=200 + +:: +:: +:: + +SET IOC=ioc64 + +:: +:: +:: + +SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info + +SET IOC_IR_OPTS_DBG=%OPENCL_STD% 
-cl-kernel-arg-info -g + +SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% + +:: +:: +:: + +REM SET PRE_DIR=%~p1 +REM CD %PRE_DIR% + +SET PRE_CL=%~n1 +SET PRE_CL=%PRE_CL%.pre.cl + +SET PRE_SRC_INL=%~n1 +SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl + +SET PRE_BIN_IR=%~n1 +SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir + +SET PRE_BIN_INL=%~n1 +SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl + +:: +:: +:: + +SET DIR_CL12="%INTELOCLSDKROOT%include" +SET DIR_COMPUTE=..\..\..\..\..\..\.. +SET DIR_SKC=%DIR_COMPUTE%\skc +SET DIR_PLATFORM=%DIR_SKC%\platforms\cl_12 +SET DIR_DEVICE=.. + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C cl -I %DIR_CL12% -I %DIR_DEVICE% -I %DIR_PLATFORM% -I %DIR_SKC% -I %DIR_COMPUTE% -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" +CMD /C clang-format -style=Mozilla -i %PRE_CL% +CMD /C dos2unix -q %PRE_CL% +CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% + +echo %PRE_CL% +echo %PRE_SRC_INL% + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C touch %PRE_BIN_IR% +ECHO ON +@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% +@ECHO OFF +CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% + +echo %PRE_BIN_IR% +echo %PRE_BIN_INL% + + diff --git a/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl new file mode 100644 index 0000000000..39fee75f3d --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl @@ -0,0 +1,309 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "block.h" +#include "path.h" +#include "common.h" +#include "atomic_cl.h" +#include "raster_builder_cl_12.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) + +#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) +#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) + +#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// +// + +#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// +// + +#if ( SKC_FILLS_EXPAND_X == 1 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_FILLS_EXPAND_X == 2 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_FILLS_EXPAND_X == 4 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_FILLS_EXPAND_X == 8 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_FILLS_EXPAND_X == 16) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_FILLS_EXPAND_X" +#endif + +// +// Fill and rasterize cmds only differ in their first word semantics +// + +union skc_cmd_expand +{ + union skc_cmd_fill fill; + union skc_cmd_rasterize rasterize; +}; + +// +// +// + +union skc_path_elem +{ + skc_uint u32; + skc_float f32; +}; + +// +// COMPILE-TIME AND RUN-TIME MACROS +// + +#define SKC_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +#define 
SKC_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + SKC_BROADCAST_LAST_HELPER(E,I) + +// +// +// + +void +skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out, + skc_uint * const out_idx, + union skc_cmd_expand * const cmd, + union skc_path_elem const e, + skc_uint const e_idx) +{ + // + // FIXME -- we can append a large number of nodeword indices to a + // local SMEM queue and flush when full. It may or may not be a + // performance win on some architectures. + // + skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT; + skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0); + + cmd->rasterize.nodeword = e_idx; + + if (is_elem) { + cmds_out[*out_idx + offset] = cmd->rasterize; + } + + *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1); +} + +// +// +// + +__kernel +SKC_FILLS_EXPAND_KERNEL_ATTRIBS +void +skc_kernel_fills_expand(__global union skc_path_elem const * const blocks, + __global skc_uint volatile * const atomics, + __global skc_block_id_t const * const map, + __global union skc_cmd_fill const * const cmds_in, + __global union skc_cmd_rasterize * const cmds_out) +{ + // + // Need to harmonize the way we determine a subgroup's id. In this + // kernel it's not as important because no local memory is being + // used. Although the device/mask calc to determine subgroup and + // lanes is still proper, we might want to make it clearer that + // we're working with subgroups by using the subgroup API. 
+ // + // every subgroup/simd that will work on the block loads the same command + // +#if (__OPENCL_VERSION__ < 200) + skc_uint const cmd_stride = get_num_sub_groups(); +#else + skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id(); + + // load fill command -- we reuse y component + union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] }; + + // get the path header block from the map + skc_block_id_t id = map[cmd.fill.path]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("expand[%u] = %u\n",cmd_idx,id); +#endif + + // + // blindly load all of the head elements into registers + // + skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + skc_uint count_nodes, count_prims; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ + count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \ + } \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \ + count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // debug of path head + // +#if 0 + skc_uint count_blocks; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ + count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + if (get_sub_group_local_id() == 0) + printf("path header = { %5u, %5u, %5u }\n", + count_blocks,count_nodes,count_prims); +#endif + + // + // acquire slots in the expanded cmd extent + // + // decrement prim_idx by 1 so we can use inclusive warp scan later + // + skc_uint out_idx = 0; + + if (get_sub_group_local_id() == 0) { + out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP + (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1; + } + + out_idx = sub_group_broadcast(out_idx,0); + + // + // process ids trailing the path header + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ + if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \ + h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \ + } \ + } \ + skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \ + head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, process the nodes + // + + // + // get id of next node + // + id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); + + // + // the following blocks are nodes + // + while (true) + { + // get index of each element + skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); + + // + // blindly load all of the node elements into registers + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // append all valid ids + // +#undef 
SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \ + node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // any more nodes? + if (--count_nodes == 0) + return; + + // + // get id of next node + // + id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl new file mode 100644 index 0000000000..302ea14af2 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl @@ -0,0 +1,543 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "path.h" +#include "block_pool_cl.h" +#include "path_builder_cl_12.h" +#include "device_cl_12.h" + +// +// +// + +#if 0 + +// +// SIMD AVX2 +// + +#define SKC_PATHS_COPY_WORDS_PER_ELEM 8 +#define SKC_PATHS_COPY_SUBGROUP_SIZE 1 +#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES + +typedef skc_uint8 skc_paths_copy_elem; +typedef skc_uint8 skc_pb_idx_v; + +#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() + +#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS) + +#endif + +// +// +// + +#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) +#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) +#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) +#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) + +// FIXME -- use SUBGROUP terminology everywhere +#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) + +// +// +// + +#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ + (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) + +#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ + (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) + +// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) + +// +// +// + +// +// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL +// + +#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) + +#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) + +// +// +// + +skc_uint +skc_sub_group_local_id() +{ +#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 + return get_sub_group_local_id(); +#else + return 0; +#endif +} + +// +// convert an atomic read counter offset to a block id +// + +skc_block_id_t +skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, + skc_uint const bp_idx_mask, + skc_uint const bp_reads, + skc_uint const bp_off) +{ + skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; + + return bp_ids[bp_idx]; +} + +// +// +// + +void +skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // 
to + skc_uint const bp_elems_idx, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx) +{ + for (skc_uint ii=0; ii\n",ii,bp_idx,b,elem C); + + SKC_PATHS_COPY_ELEM_EXPAND(); + + // store the elem back + (bp_elems+bp_elems_idx)[ii] = elem; + } +} + +// +// +// + +void +skc_host_map_update(__global skc_uint * const host_map, + skc_uint const block, + skc_paths_copy_elem const elem) +{ + // + // write first elem to map -- FIXME -- this is a little nasty + // because it relies on the the host handle always being the first + // word in the path header. + // + // OTOH, this is not unreasonable. The alternative is to have a + // separate kernel initializing the map. + // +#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 + if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) +#endif + { +#if SKC_PATHS_COPY_ELEM_WORDS == 1 + host_map[elem] = block; +#if 0 + printf("[%u] = %u\n",elem,block); +#endif +#else + host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; +#endif + } +} + +// +// +// + +void +skc_copy_head(__global skc_uint * const host_map, + skc_uint const block, + __global skc_paths_copy_elem * const bp_elems, // to + skc_uint const bp_elems_idx, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_reads, + skc_uint const bp_idx_mask, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx, + skc_uint const pb_rolling) +{ + // + // if there are more path header words than there are + // threads-per-block then we can just copy the initial header words + // +#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) + for (skc_uint ii=0; ii= pb_size) + pb_idx -= pb_size; + + // broadcast load the command + union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; + + // what do we want pb_elems do with this block? 
+ skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); + + // compute offset from rolling base to get index into block pool ring allocation + skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); + + // convert the pb_cmd's offset counter pb_elems a block id + skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); + printf("< %8u >\n",block); + } +#endif + + // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() + skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; + + // calculate bp_elems (to) / pb_elems (from) + skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; + skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; + + if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) + { +#if 0 + if (tid == 0) + printf("%3u, segs\n",bp_off); +#endif + skc_copy_segs(bp_elems, + bp_elems_idx, + pb_elems, + pb_elems_idx); + } + else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) + { +#if 0 + if (tid == 0) + printf("%3u, NODE\n",bp_off); +#endif + skc_copy_node(bp_elems, // to + bp_elems_idx, + bp_ids, + bp_reads, + bp_idx_mask, + pb_elems, // from + pb_elems_idx, + pb_rolling); + } + else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) + { +#if 0 + if (tid == 0) + printf("%3u, HEAD\n",bp_off); +#endif + skc_copy_head(host_map, + block, + bp_elems, // to + bp_elems_idx, + bp_ids, + bp_reads, + bp_idx_mask, + pb_elems, // from + pb_elems_idx, + pb_rolling); + } +} + +// +// +// + +__kernel +SKC_PATHS_ALLOC_KERNEL_ATTRIBS +void +skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, + __global skc_uint * const bp_alloc, + skc_uint const bp_alloc_idx, + skc_uint const pb_cmd_count) +{ + // + // allocate blocks in block pool + // + skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); + + // store in slot + bp_alloc[bp_alloc_idx] = reads; + +#if 0 + printf("pc: %8u + %u\n",reads,pb_cmd_count); +#endif +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl new file mode 100644 index 0000000000..2aee5dac17 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl @@ -0,0 +1,390 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// FIXME -- a pre-allocation step could load the path header quads and +// total up the number of blocks in the workgroup or subgroup +// minimizing the number of later atomics adds. 
+// + +#include "block.h" +#include "path.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS) + +#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS) + +// +// +// + +#if ( SKC_PATHS_RECLAIM_X == 1 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_PATHS_RECLAIM_X == 2 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_PATHS_RECLAIM_X == 4 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_PATHS_RECLAIM_X == 8 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_PATHS_RECLAIM_X == 16) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_PATHS_RECLAIM_X" +#endif + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + SKC_BROADCAST_LAST_HELPER(E,I) + +// +// COMPILE-TIME PREDICATES +// + +#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \ + SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I) + +#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \ + SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I) + +// +// RUN-TIME PREDICATES +// + +#define SKC_PATHS_RECLAIM_IS_HEADER(I) \ + (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS) + +// +// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL +// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK +// COMBOS (NOT NECESSARILY POW2) +// +// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR +// UINT TYPE INSTEAD OF A ULONG. +// + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 +#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint + +// +// +// + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ + (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ + ? 
0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ + S = sub_group_scan_exclusive_add(C) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \ + (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK) + +// +// +// + +struct skc_reclaim +{ + skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; +}; + +__kernel +SKC_PATHS_RECLAIM_KERNEL_ATTRIBS +void +skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring + __global skc_uint * const bp_elems, // block pool blocks + __global skc_uint volatile * const bp_atomics, // read/write atomics + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const map, // path host-to-device map + struct skc_reclaim const reclaim) // array of host path ids +{ +#if (__OPENCL_VERSION__ < 200) + skc_uint const reclaim_stride = get_num_sub_groups(); +#else + skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); + +#if 0 + // + // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT + // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL + // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE + // RECLAMATION JOB ON THE REST OF THE PIPELINE. + // + for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) +#endif + { + // get host path id + skc_path_h const path = reclaim.aN[reclaim_idx]; + + // get the path header block from the map + skc_block_id_t id = map[path]; + + // + // blindly load all of the head elements into registers + // + skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + skc_uint count_blocks, count_nodes; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ + count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ + } \ + if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ + count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes); + } +#endif + + // + // acquire a span in the block pool ids ring for reclaimed ids + // + // FIXME count_blocks and atomic add can be done in same lane + // + skc_uint bp_ids_base = 0; + + if (get_sub_group_local_id() == 0) { + bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); + +#if 0 + printf("paths: bp_ids_base = %u\n",bp_ids_base); +#endif + } + + bp_ids_base = sub_group_broadcast(bp_ids_base,0); + + // + // shift away the tagged block id's tag + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); + + 
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; + } + + // + // - we'll skip subgroups that are entirely header + // + // - but we need to mark any header elements that partially fill + // a subgroup as invalid tagged block ids + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \ + if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \ + h##I = SKC_TAGGED_BLOCK_ID_INVALID; \ + } \ + } \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + { + // + // count reclaimable blocks in each lane + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = h##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // printf("P %7u ! %u\n",bp_ids_idx,h##I); + } + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, walk the nodes + // + do { + // id of next block is in last lane + id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); + + // get index of each element + skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // + // blindly load all of the node elements into registers + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // shift away the tagged block id's tag + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; + } + + // + // count reclaimable blocks in each lane + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint 
const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = n##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // printf("P %7u ! %u\n",bp_ids_idx,n##I); + + // any more nodes? + } while (--count_nodes > 0); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/place.cl b/src/compute/skc/platforms/cl_12/kernels/place.cl new file mode 100644 index 0000000000..92fa0a243d --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/place.cl @@ -0,0 +1,871 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "common.h" +#include "raster.h" +#include "atomic_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1) +#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1) + +// +// +// + +#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK + +// +// +// + +#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE) + +// +// +// + +#if ( SKC_PLACE_X == 1 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_1() +#define SKC_PLACE_EXPAND_I_LAST 0 + +#elif ( SKC_PLACE_X == 2 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_2() +#define SKC_PLACE_EXPAND_I_LAST 1 + +#elif ( SKC_PLACE_X == 4 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_4() +#define SKC_PLACE_EXPAND_I_LAST 3 + +#elif ( SKC_PLACE_X == 8 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_8() +#define SKC_PLACE_EXPAND_I_LAST 7 + +#elif ( SKC_PLACE_X == 16) +#define SKC_PLACE_EXPAND() SKC_EXPAND_16() +#define SKC_PLACE_EXPAND_I_LAST 15 +#endif + +// +// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE +// COALESCED WRITES. LO FIRST, FOLLOWED BY HI. +// +// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE +// KERNELS USE DIFFERENT SUBGROUP SIZES. +// +// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE +// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID. +// +// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER +// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY +// ONLY SUPPORT A SUBGROUP SIZE OF 16. 
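//
// A minimal serial C sketch of the lo/hi key layout described above,
// assuming equal prefix/place subgroup sizes: row I of a block holds
// SUBGROUP_SIZE low words followed by SUBGROUP_SIZE high words, so
// lane L rebuilds its 64-bit xk key from two 32-bit loads one
// subgroup apart.  The subgroup size of 8 is illustrative only.
//

#include <stdint.h>

#define SG_SIZE 8u  /* assumed subgroup size, for the sketch only */

/* rebuild the 64-bit key of (row i, lane l) from a block of 32-bit words */
static uint64_t
xk_load(uint32_t const * const block, uint32_t const i, uint32_t const l)
{
  uint32_t const lo = block[i * 2u * SG_SIZE + l];           /* STRIDE_V_LO */
  uint32_t const hi = block[i * 2u * SG_SIZE + SG_SIZE + l]; /* STRIDE_V_HI */

  return (uint64_t)lo | ((uint64_t)hi << 32);
}

//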
+// + +#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE ) + +#define SKC_PLACE_STRIDE_H(L) (L) +#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1) +#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK)) + +#define SKC_PLACE_STRIDE_H(L) (L) +#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) +#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask + +#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK)) +#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO) + +#endif + +// +// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE +// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8) +// + +#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS) + +#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS) + +#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS) + +#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) + + +// +// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX +// +#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) +#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k)) + +// +// TTSK v2: +// +// 0 63 +// | TTSB ID | PREFIX | SPAN | X | Y | +// +---------+--------+---------+-----+-----+ +// | 27 | 1 (=0) | 12 (=0) | 12 | 12 | +// +// +// TTPK v2: +// +// 0 63 +// | TTPB ID | PREFIX | SPAN | X | Y | +// +---------+--------+------+-----+-----+ +// | 27 | 1 (=1) | 12 | 12 | 12 | +// +// + +// +// TTCK (32-BIT COMPARE) v1: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 18 | 7 | 7 | +// +// +// TTCK (32-BIT COMPARE) v2: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 15 | 9 | 8 | +// +// +// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 27 | 1 | 1 | 18 | 9 | 8 | +// + +union skc_subgroup_smem +{ + skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE + + 
struct { + struct { + skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; + skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; + } lo; + + struct { + skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; + skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; + } hi; + + // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK]; + }; + +}; + +// +// scatter scan max +// +static +skc_int_v_t +skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem, + skc_int_v_t const iss, + skc_int_v_t const ess) +{ + // + // prefix sums determine which lanes we're going to work on next + // + skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE); + skc_int_v_t const scratch_idx = max(ess,0); + + // + // SIMT + // + + // + // zero the volatile smem scratchpad using vector syntax + // + smem->scratch[get_sub_group_local_id()] = ( 0 ); + + // + // store source lane at starting lane + // + if (is_scratch_store) { + smem->scratch[scratch_idx] = get_sub_group_local_id(); + } + + // + // propagate lanes to right using max scan + // + skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()]; + skc_int_v_t const source = sub_group_scan_inclusive_max(scratch); + + return source; +} + +// +// +// + +static +skc_bool +skc_xk_clip(union skc_tile_clip const * const tile_clip, + skc_ttxk_t * const xk) +{ + // + // clip the sk and pk keys + // + // if fully clipped then return false + // + // alternatively -- we can expand all these keys in place + // + // alternatively -- keep sk and pk keys segregated because sk + // represents the vast majority of keys and are easier to process. + // don't mess with the fastpath! + // + return false; +} + +// +// +// + +static +skc_ttck_t +skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const sk_idx) +{ + skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0 + skc_uint const hi = smem->hi.sk[sk_idx]; + + skc_ttck_t ck; + + ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id + + // FIXME -- x and y should already be clipped and shifted + skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; + skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; + + ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; + + return ck; +} + +static +skc_ttck_t +skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const pk_idx, + skc_uint const dx) +{ + skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1 + skc_uint const hi = smem->hi.pk[pk_idx]; + + skc_ttck_t ck; + + ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id + + // FIXME -- x and y should already be clipped and shifted + skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; + skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; + + ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; + + return ck; +} + +// +// +// + +static +void +skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const sk) +{ + // + // Pretty sure you can never ever have an sk 
count equal to 0 + // + skc_uint ck_base = 0; + + // last lane performs the block pool allocation with an atomic increment + if (get_sub_group_local_id() == 0) { + ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk); + } + + // broadcast base to all lanes + ck_base = sub_group_broadcast(ck_base,0); + + // convert sk keys to ck keys + for (skc_uint ii=get_sub_group_local_id(); iilo.pk[idx]; + skc_uint const hi = smem->hi.pk[idx]; + + skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN; + skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN; + + return (span_lo | span_hi) + 1; +} + +// +// +// + +static +void +skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const pk) +{ + // bail out if pk queue is empty + if (pk == 0) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("%u\n",pk); +#endif + + // + // FIXME -- this nested loop iterates over the queue processing a + // subgroup of 64-bit keys at a time. This is probably not the most + // efficient approach so investigate how to store and iterate over a + // wider than subgroup (node-sized) queue of keys. + // + + // round up so we work with full subgroups + skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK; + skc_uint ii = 0; + + // nested loop that expands all ttpk keys +#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE) + for (; ii\n",xk_idx); +#endif + + return xk_idx; +#endif +} + +// +// +// +__kernel +SKC_PLACE_KERNEL_ATTRIBS +void +skc_kernel_place(__global skc_bp_elem_t * const bp_elems, + __global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __global union skc_cmd_place const * const cmds, + __global skc_block_id_t * const map, + skc_uint4 const clip, + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) + __local union skc_subgroup_smem volatile smem[1]; +#else + __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS]; + __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // This is a subgroup-centric kernel + // + // Which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // + // Test the raster's translated bounds against the composition's + // tile clip + // + // There are 3 cases: + // + // - the raster is completely clipped -> return + // - the raster is partially clipped -> all keys must clipped + // - the raster is not clipped -> no keys are tested + // + // + // There are at least 4 implementations of place and we want to + // special-case them as much as possible so that, at the least, the + // fastpath remains fast. 
+ // + // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP + // + // - implement CLIPPED + NO TRANSLATION path + // + // - implement NO CLIP + TRANSLATION path + // + // - implement CLIPPED + TRANSLATION path + // + // + // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin + // 12:12:8 integer where: + // + // 12: ttsk + // 12: ttpk + // 8: /dev/null -- clipped or invalid key + // + // Three kinds of nodes in a raster's list: + // + // - the head node + // - an internal node + // - the final node + // + +#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const cmd_idx = get_group_id(0); +#else + skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // load command + union skc_cmd_place const cmd = cmds[cmd_idx]; + + // get the raster header from the raster host id -- scalar + skc_block_id_t id = map[cmd.raster_h]; + + // + // load all of the head block ttxk keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_raster_node_elem const h##I = { \ + .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \ + bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \ + }; + + SKC_PLACE_EXPAND(); + + // + // load raster header counts -- we only need the "nodes" and "keys" + // words but the keys we loaded are doublewords. + // + // FIXME -- this can be made portable with compile-time macro expansion + // + skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES + skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS + + // + // + // +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ + nodes,keys, \ + I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ + h##I.u32v2.hi,h##I.u32v2.lo, \ + h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); + + SKC_PLACE_EXPAND(); +#endif + + // +#if 0 + if (get_sub_group_local_id() == 0) { + printf("place: %u / %u / %u\n",head_id,nodes,keys); + } +#endif + + { + // + // classify every key in the header + // + // keys: 0 is not a key / 1 is a key + // skpk: 0 is sk / 1 is pk + // + skc_uint bits_keys = 0; + skc_uint bits_skpk = 0; + + // + // calculate bits_keys + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \ + if (idx < keys) { \ + bits_keys |= (1u << I); \ + } \ + if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ + if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \ + if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ + bits_keys &= ~(1u << I); \ + } \ + } \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // blindly calculate bits_skpk + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2X : %2X\n",bits_keys,bits_skpk); +#endif + + // + // next pointer is last element of last row. save it now because + // this might be recognized as a subgroup-uniform/scalar. 
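//
// A minimal serial C sketch of the bits_keys/bits_skpk classification
// computed above: bit I of bits_keys marks row I as holding a live
// key, and bit I of bits_skpk marks that key's prefix bit (TTPK vs
// TTSK).  The row count and prefix-bit position are assumptions of
// the sketch, not the real TTXK layout.
//

#include <stdint.h>

#define ROWS_SKETCH  4u          /* key rows handled per lane (assumed)     */
#define PREFIX_BIT   (1u << 31)  /* assumed position of the TTXK prefix bit */

/* build the per-lane "is a key" and "is a TTPK" row masks */
static void
classify_rows(uint32_t const * const xk_lo,      /* low key word per row           */
              uint32_t         const live,       /* live keys in this lane's rows  */
              uint32_t       * const bits_keys,
              uint32_t       * const bits_skpk)
{
  *bits_keys = 0u;
  *bits_skpk = 0u;

  for (uint32_t i = 0; i < ROWS_SKETCH; i++)
    {
      if (i < live)
        *bits_keys |= 1u << i;       /* row i holds a valid key            */

      if (xk_lo[i] & PREFIX_BIT)
        *bits_skpk |= 1u << i;       /* prefix bit set: a TTPK, not a TTSK */
    }
}

//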
+ // + id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); + + // + // append SK keys first + // + skc_uint const bits_sk = bits_keys & ~bits_skpk; + skc_uint sk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint is_sk = (bits_sk >> I) & 1; \ + skc_uint sk_idx = skc_ballot(&sk,is_sk); \ + if (is_sk) { \ + smem->lo.sk[sk_idx] = h##I.xk.lo; \ + smem->hi.sk[sk_idx] = h##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // append PK keys next + // + skc_uint const bits_pk = bits_keys & bits_skpk; + skc_uint pk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint is_pk = (bits_pk >> I) & 1; \ + skc_uint pk_idx = skc_ballot(&pk,is_pk); \ + if (is_pk) { \ + smem->lo.pk[pk_idx] = h##I.xk.lo; \ + smem->hi.pk[pk_idx] = h##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2u * %2u\n",sk,pk); +#endif + // + // flush the keys + // + skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); + skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); + } + + // + // we're done if there was only a head node + // + if (nodes == 0) + return; + + // + // decrement keys + // + keys -= SKC_RASTER_HEAD_COUNT_KEYS; + + // + // otherwise, append keys in trailing nodes to smem + // + while (true) + { + // + // load all of the node block ttxk keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_raster_node_elem const n##I = { \ + .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \ + bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \ + }; + + SKC_PLACE_EXPAND(); + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ + nodes,keys, \ + I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ + n##I.u32v2.hi,n##I.u32v2.lo, \ + n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); + + SKC_PLACE_EXPAND(); +#endif + + // + // classify every key in the header + // + // keys: 0 is not a key / 1 is a key + // skpk: 0 is sk / 1 is pk + // + skc_uint bits_keys = 0; + skc_uint bits_skpk = 0; + + // + // calculate bits_keys + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \ + if (idx < keys) { \ + bits_keys |= (1u << I); \ + } \ + if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ + if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \ + if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ + bits_keys &= ~(1u << I); \ + } \ + } \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // blindly calculate bits_skpk + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2X : %2X\n",bits_keys,bits_skpk); +#endif + + // + // next pointer is last element of last row. save it now because + // this might be recognized as a subgroup-uniform/scalar. 
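//
// A minimal serial C sketch of the skc_ballot() compaction used in
// the appends above: an exclusive scan over a 0/1 predicate hands
// every selected lane a dense slot in the sk/pk queue and advances
// the queue count by the subgroup total.  The lane count is an
// assumption of the sketch.
//

#include <stdint.h>

#define LANES_SKETCH 8u  /* assumed subgroup size */

/* append the selected per-lane keys to a dense queue; returns the new count */
static uint32_t
ballot_append(uint32_t       * const queue,
              uint32_t         const count,   /* keys already queued        */
              uint32_t const * const keys,    /* one candidate key per lane */
              uint32_t const * const is_sel)  /* 0/1 predicate per lane     */
{
  uint32_t next = count;

  for (uint32_t lane = 0; lane < LANES_SKETCH; lane++)  /* exclusive scan, serialized */
    {
      if (is_sel[lane])
        queue[next++] = keys[lane];   /* each selected lane gets a dense slot */
    }

  return next;
}

//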
+ // + id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); + + // + // append SK keys first + // + skc_uint const bits_sk = bits_keys & ~bits_skpk; + skc_uint sk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint is_sk = (bits_sk >> I) & 1; \ + skc_uint sk_idx = skc_ballot(&sk,is_sk); \ + if (is_sk) { \ + smem->lo.sk[sk_idx] = n##I.xk.lo; \ + smem->hi.sk[sk_idx] = n##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // append PK keys next + // + skc_uint const bits_pk = bits_keys & bits_skpk; + skc_uint pk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint is_pk = (bits_pk >> I) & 1; \ + skc_uint pk_idx = skc_ballot(&pk,is_pk); \ + if (is_pk) { \ + smem->lo.pk[pk_idx] = n##I.xk.lo; \ + smem->hi.pk[pk_idx] = n##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2u * %2u\n",sk,pk); +#endif + // + // if total for either the sk or pk queue reaches the + // highwater mark then flush it to the extent + // + skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); + skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); + + // + // if this was the last node then we're done + // + if (--nodes == 0) + return; + + // + // otherwise decrement keys + // + keys -= SKC_RASTER_NODE_COUNT_KEYS; + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/prefix.cl b/src/compute/skc/platforms/cl_12/kernels/prefix.cl new file mode 100644 index 0000000000..21a51694da --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/prefix.cl @@ -0,0 +1,1041 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "raster.h" +#include "atomic_cl.h" +#include "raster_builder_cl_12.h" +#include "device_cl_12.h" + +// +// INPUT: +// +// TTRK (64-BIT COMPARE) +// +// 0 63 +// | TTSB ID | X | Y | COHORT ID | +// +---------+------+------+-----------+ +// | 27 | 12 | 12 | 13 | +// +// +// TTRK (32-BIT COMPARE) +// +// 0 63 +// | TTSB ID | N/A | X | Y | COHORT ID | +// +---------+-----+------+------+-----------+ +// | 27 | 5 | 12 | 12 | 8 | +// +// +// OUTPUT: +// +// TTSK v2: +// +// 0 63 +// | TTSB ID | PREFIX | N/A | X | Y | +// +---------+--------+------+----+----+ +// | 27 | 1 (=0) | 12 | 12 | 12 | +// +// +// TTPK v1: +// +// 0 63 +// | TTPB ID | ALL ZEROES | SPAN | X | Y | +// +---------+------------+------+-----+-----+ +// | 27 | 1 | 12 | 12 | 12 | +// +// +// TTPK v2: +// +// 0 63 +// | TTPB ID | PREFIX | SPAN | X | Y | +// +---------+--------+------+-----+-----+ +// | 27 | 1 (=1) | 12 | 12 | 12 | +// + +#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1) + +// +// smem accumulator +// + +union skc_subgroup_accum +{ + struct { + SKC_ATOMIC_INT ttp[SKC_TILE_HEIGHT]; + } atomic; + + struct { + skc_ttp_t ttp[SKC_TILE_HEIGHT]; + } aN; + + struct { + SKC_PREFIX_TTP_V ttp[SKC_PREFIX_SUBGROUP_SIZE]; + } vN; + + struct { + SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH]; + } zero; +}; + +// +// +// + +struct skc_subgroup_smem +{ + // prefix accumulator + union skc_subgroup_accum accum; +}; + +// +// +// + +static +skc_uint +skc_subgroup_lane() +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + return get_sub_group_local_id(); +#else + return 0; +#endif +} + +// +// +// + +static +SKC_PREFIX_TTS_V_BITFIELD +skc_tts_get_dy(skc_tts_v_t const ttsv) +{ + // tts.dy is packed to fit in range [-32,31] and unpacked to 
[-32..-1,+1..+32] + SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY; + + return dy - (~ttsv >> 31); +} + +static +SKC_PREFIX_TTS_V_BITFIELD +skc_tts_get_py(skc_tts_v_t const ttsv) +{ + return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2); +} + +// +// +// + +static +void +skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v) +{ + // get "altitude" + SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v); + + // get the y pixel coordinate + SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v); + + // + // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid? + // + // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op + // + +#if 0 + if (tts_v != SKC_TTS_INVALID) + printf("< %08X = %u : %d >\n",tts_v,py,dy); +#endif + + // + // scatter-add the "altitude" to accumulator + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \ + } + +#else + // + // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS + // + // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C == SKC_TTS_INVALID) \ + return; \ + smem->accum.aN.ttp[py C] = dy C; +#endif + + SKC_PREFIX_TTS_VECTOR_INT_EXPAND(); +} + +// +// The implication here is that if our device configuration has a +// rectangular 1:2 tile then we need a block size of at least 2 +// subblocks. The subblock size of course needs to match the length of +// the smallest tile side. +// + +static +void +skc_accum_flush(__local struct skc_subgroup_smem * const smem, + __global skc_bp_elem_t * const bp_elems, + skc_block_id_t const pb_id) +{ + // load the ttp elements + SKC_PREFIX_TTP_V const ttp_v = smem->accum.vN.ttp[get_sub_group_local_id()]; + skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); + +#if ( SKC_TILE_RATIO == 1 ) + + bp_elems[offset] = ttp_v; + +#elif ( SKC_TILE_RATIO == 2 ) + + vstore2(ttp_v,offset,bp_elems); + +#else + +#error("tile ratio greater than 2 not supported") + +#endif +} + +// +// +// + +static +void +skc_accum_reset(__local struct skc_subgroup_smem * const smem) +{ + for (uint ii=0; iiaccum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 ); +} + +// +// get next sk key +// + +static +skc_ttsk_s_t +skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v, + skc_uint * const sk_next, + skc_int * const rkpk_rem) +{ + // decrement count + *rkpk_rem -= 1; + +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT with subgroup support is easy + // + // SIMT without subgroup support can always emulate with smem + // +#if 0 + // + // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly + // broadcast a uint2 cast to a long. It was probably bad to do this + // anyway without a union wrapping the TTSK scalar type. + // + // Consider creating a union { ulong; uint2 } at a later date -- + // probably no need to ever do this unless it makes broadcast faster + // which is unlikely since it will probably be implemented as 2 + // 32-bit broadcasts. + // + // Additionally, the TTRK and TTXK key bitfield sizes are probably + // cast in stone and we aren't going to change them no matter + // architecture we're on. 
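//
// A minimal serial C sketch of the two-broadcast workaround discussed
// above: rather than broadcasting the key as a single 64-bit value,
// its lo and hi words are fetched separately and recombined.  The
// lane arrays stand in for the per-lane subgroup registers.
//

#include <stdint.h>

typedef struct { uint32_t lo, hi; } ttsk_sketch_t;

/* "broadcast" lane `src` of a per-lane key vector as two 32-bit reads */
static ttsk_sketch_t
ttsk_broadcast(uint32_t const * const lo_lanes,
               uint32_t const * const hi_lanes,
               uint32_t         const src)
{
  ttsk_sketch_t const k = { lo_lanes[src], hi_lanes[src] };  /* lo and hi fetched separately */

  return k;
}

//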
+ // + skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++); +#else + skc_ttsk_s_t sk_s; + + sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next); + sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next); + *sk_next += 1; +#endif + +#else + // + // SIMD will always grab component .s0 and then rotate the vector + // + sk_s = ( sk_v->s0 ); + + skc_ttsk_v_rotate_down(sk_v); + +#endif + + return sk_s; +} + +// +// +// + +static +skc_raster_yx_s +skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next) +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT with subgroup support is easy + // + // SIMT without subgroup support can always emulate with smem + // + skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next); + +#else + // + // SIMD will always grab component .s0 and then rotate the vector + // + skc_raster_yx_s const yx_s = ( sk_v->s0.hi ); + +#endif + + return yx_s; +} + +// +// mask off ttsb id +// + +static +skc_block_id_s_t +skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s) +{ + return ( sk_s->lo & SKC_TTXK_LO_MASK_ID ); +} + +// +// load tts_v as early as possible +// + +static +skc_tts_v_t +skc_load_tts(__global skc_bp_elem_t * const bp_elems, + skc_block_id_s_t const sb_id) +{ + return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] ); +} + +// +// massage ttrk keys into ttsk keys +// + +static +void +skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v) +{ + sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID; // clear high (N/A) bits + sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits +} + +// +// replenish ttsk keys +// + +static +void +skc_ttsk_v_replenish(skc_ttsk_v_t * const sk_v, + skc_uint * const sk_next, + skc_uint * const rks_next, + __global skc_ttrk_e_t const * const rks) +{ + // if there are still keys available then return + if (*sk_next < SKC_PREFIX_TTXK_V_SIZE) + return; + + // + // otherwise, replenish sk_v + // + // NOTE NOTE NOTE -- we are assuming rks[] extent size is always + // divisible by TTXK_V_SIZE and therefore loading some keys from the + // next raster is OK. + // + *sk_next = 0; + *rks_next += SKC_PREFIX_SUBGROUP_SIZE; + *sk_v = rks[*rks_next]; + +#if 0 + printf("* %08X ( %3u, %3u )\n", + sk_v->hi, + (sk_v->hi >> 12) & 0xFFF, + (sk_v->hi ) & 0xFFF); +#endif + + skc_ttrk_to_ttsk(sk_v); + +#if 0 + printf("! %08X ( %3u, %3u )\n", + sk_v->hi, + (sk_v->hi >> 20) & 0xFFF, + (sk_v->hi >> 8) & 0xFFF); +#endif +} + +// +// replenish block ids +// +// note that you can't overrun the block id pool since it's a ring +// + +static +void +skc_blocks_replenish(skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) + +{ + *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE; + *blocks = bp_ids[*blocks_idx & bp_mask]; + *blocks_next = 0; + +#if 0 + printf("replenish blocks: %u\n",*blocks); +#endif +} + +// +// +// + +static +skc_block_id_t +skc_blocks_get_next(skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // replenish? 
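//
// A minimal serial C sketch of this allocator: a small per-subgroup
// cache of block ids is consumed one id at a time and refilled from
// the pow2 ring only when it runs dry.  The cache and ring sizes
// below are assumptions of the sketch.
//

#include <stdint.h>

#define CACHE_SIZE_SKETCH 8u      /* one id per lane (assumed)       */
#define RING_MASK_SKETCH  1023u   /* pow2 ring of 1024 ids (assumed) */

struct id_cache_sketch
{
  uint32_t ids[CACHE_SIZE_SKETCH]; /* the per-subgroup vector of ids  */
  uint32_t next;                   /* next unconsumed entry           */
  uint32_t ring_idx;               /* where the cache was filled from */
};

/* hand out one block id, refilling the cache from the ring when dry */
static uint32_t
blocks_get_next_sketch(struct id_cache_sketch * const c,
                       uint32_t const         * const ring)
{
  if (c->next == CACHE_SIZE_SKETCH)                       /* replenish? */
    {
      c->ring_idx += CACHE_SIZE_SKETCH;

      for (uint32_t i = 0; i < CACHE_SIZE_SKETCH; i++)    /* one coalesced load on the device */
        c->ids[i] = ring[(c->ring_idx + i) & RING_MASK_SKETCH];

      c->next = 0;
    }

  return c->ids[c->next++];                               /* broadcast(blocks,next++) on the device */
}

//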
+ if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE) + { + skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + } + +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); + +#else + // + // SIMD + // + skc_block_id_t id = blocks->s0; + + skc_shuffle_down_1(*blocks); + +#endif + + *blocks_next += 1; + + return id; +} + +// +// subblock allocator +// + +#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) + +static +skc_block_id_t +skc_subblocks_get_next_pb_id(skc_block_id_t * const subblocks, + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + } + + skc_block_id_t const pb_id = *subblocks; + + *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks + + return pb_id; +} + +#endif + +// +// append a ttsk key to the work-in-progress node +// + +static +void +skc_node_v_append_sk(skc_ttsk_s_t const * const sk_s, + + skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + skc_uint * const xk_v_idx, + __global skc_bp_elem_t * const bp_elems, + + skc_int const rkpk_rem, + + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, + __global skc_block_id_t const * const bp_ids) +{ + // + // Append an sk key to the in-register xk_v vector + // + // If the work-in-progress node in gmem will only have room for one + // more key then: + // + // - if this was the final SK then write out xk_v and exit + // + // - otherwise, acquire a block id, link it, write out xk_v, + // prepare new node + // + // Note that this does *not* try to squeeze in a final key into the + // next node slot. This optimization isn't worth the added + // down-pipeline complexity. + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) + { + *xk_v = *sk_s; + } + + *xk_v_next += 1; + + // are there more keys coming? + if (rkpk_rem > 0) + { + // is the node almost full? + if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) + { + skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + + if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) + { + xk_v->lo = id; + xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary + } + + // store xk_v (uint2) to bp (uint) + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // update node elem idx + *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // reset node count + *xk_v_next = 0; + } + // is xk_v full? 
+ else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) + { + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // increment node elem idx + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + } + } + else + { + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) + { + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + + bp_elems[*xk_v_idx] = SKC_UINT_MAX; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; + } + } + +#else + // + // SIMD + // + +#endif +} + +// +// +// + +static +skc_ttpk_s_t +skc_ttpk_create(skc_raster_yx_s const yx_prev, + skc_raster_yx_s const yx_next, + skc_block_id_t const pb_id) +{ + // - yx_prev is already incremented by one + // - yx_span is already shifted up at hi.x + skc_uint const yx_span = yx_next - yx_prev; + + skc_ttpk_s_t pk; + + // turn on prefix bit | shift span bits upward + pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN); + + // shift down high span bits | yx of tile + pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("* %08v2X : %u\n",pk,yx_span); +#endif + + return pk; +} + +// +// append a ttpk key to the work-in-progress node +// + +static +void +skc_node_v_append_pk(skc_ttpk_s_t const * const pk_s, + + skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + skc_uint * const xk_v_idx, + __global skc_bp_elem_t * const bp_elems, + + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, + __global skc_block_id_t const * const bp_ids) +{ + // + // append a pk key to the in-register xk_v vector + // + // if the work-in-progress node in gmem will only have room for one + // more key then: + // + // - if this was the final SK then write out xk_v and exit + // + // - otherwise, acquire a block id, link it, write out xk_v, + // prepare new node + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) + { + *xk_v = *pk_s; + } + + *xk_v_next += 1; + + // is the node almost full? + if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) + { + skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + + if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) + { + xk_v->lo = id; + xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary + } + + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // update node elem idx + *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // reset node count + *xk_v_next = 0; + } + // is xk_v full? 
+ else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) + { + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // increment node elem idx + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + } + +#else + // + // SIMD + // +#endif +} + +// +// append the first 3 fields of meta info to the raster header +// + +static +void +skc_node_v_init_header(skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + union skc_raster_cohort_meta_out const * const meta) +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() < 2) + { + *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi; + } + +#if 0 + if (get_sub_group_local_id() == 0) + printf("header: %08v4X\n",meta->u32v4); +#endif + + // + // increment counter: uint4 + uint4 = uint2 x 4 + // + *xk_v_next = 2 + 2; // +2 for unitialized bounds + +#else + // + // SIMD + // + +#endif +} + +// +// +// + +__kernel +SKC_PREFIX_KERNEL_ATTRIBS +void +skc_kernel_prefix(__global skc_uint const * const bp_atomics, + __global skc_block_id_t const * const bp_ids, + __global skc_bp_elem_t * const bp_elems, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_ttrk_e_t const * const rks, + __global skc_block_id_t * const map, + __global skc_uint const * const metas, + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem smem[1]; +#else + __local struct skc_subgroup_smem smems[SKC_PREFIX_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id(); +#endif + + // + // where is this subgroup in the grid? + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const sgi = get_group_id(0); +#else + skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + skc_uint const sgl = get_sub_group_local_id(); + + // + // return if this subgroup is excess + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 ) + if (sgi >= count) + return; +#endif + + // + // get meta info for this subgroup's raster + // + union skc_raster_cohort_meta_out const meta = { vload4(sgi,metas) }; + skc_uint const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("%3u : %5u / %5u / %5u / %5u / %u\n", + sgi, + meta.blocks, + meta.offset, + meta.nodes, + meta.keys, + reads); +#endif + + // + // preload blocks -- align on subgroup + // + skc_uint blocks_idx = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); + skc_block_id_v_t blocks = bp_ids[blocks_idx & bp_mask]; + skc_uint blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK); + + // + // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset + // + skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // + // initialize raster header -- assumes block is greater than 8 words (4 doublewords) + // + skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX }; + skc_uint xk_v_next; + + skc_node_v_init_header(&xk_v,&xk_v_next,&meta); + + // + // no keys -- this is an empty raster! 
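//
// A minimal serial C sketch of the "preload blocks -- align on
// subgroup" step above: the starting cursor is rounded down to a
// subgroup boundary, a full vector of ids is loaded, and consumption
// begins at the cursor's remainder within that vector.  Sizes below
// are assumptions of the sketch.
//

#include <stdint.h>

#define SG_SIZE_SKETCH   8u
#define SG_MASK_SKETCH   (SG_SIZE_SKETCH - 1u)
#define RING_MASK_SKETCH 1023u   /* pow2 ring of 1024 ids (assumed) */

/* align down, load a full vector, start consuming at the remainder */
static void
preload_sketch(uint32_t const * const ring,
               uint32_t         const reads,
               uint32_t       * const vec,   /* SG_SIZE_SKETCH entries */
               uint32_t       * const next)
{
  uint32_t const base = reads & ~SG_MASK_SKETCH;            /* align down to subgroup */

  for (uint32_t lane = 0; lane < SG_SIZE_SKETCH; lane++)    /* coalesced on the device */
    vec[lane] = ring[(base + lane) & RING_MASK_SKETCH];

  *next = reads & SG_MASK_SKETCH;                           /* first unconsumed entry  */
}

//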
+ // + if (meta.keys == 0) + { + bp_elems[xk_v_idx ] = xk_v.lo; + bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi; + + while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) + { + xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + + bp_elems[xk_v_idx] = SKC_UINT_MAX; + bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; + } + + return; + } + + // + // load TTRK keys and in-place convert to TTSK keys + // + skc_uint rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); + skc_ttsk_v_t sk_v = rks[rks_next]; + skc_uint sk_next = (meta.offset & SKC_PREFIX_SUBGROUP_MASK); + skc_int rkpk_rem = meta.keys; // signed count of remaining rk+pk keys + +#if 0 + printf("* %08X ( %3u, %3u )\n", + sk_v.hi, + (sk_v.hi >> 12) & 0xFFF, + (sk_v.hi ) & 0xFFF); +#endif + + skc_ttrk_to_ttsk(&sk_v); + +#if 0 + printf("! %08X ( %3u, %3u )\n", + sk_v.hi, + (sk_v.hi >> 20) & 0xFFF, + (sk_v.hi >> 8) & 0xFFF); +#endif + + // + // subblocks + // +#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) + skc_block_id_t subblocks = 0; +#endif + + // + // begin "scan" of tiles + // + skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next); + + // + // zero the accumulator + // + skc_accum_reset(smem); + + while (true) + { + // get next rk key + skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem); + + // load ttsb id + skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s); + + // load tts_v transaction "in flight" as early as possible + skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id); + +#if 0 + printf("{ %08X }\n",tts_v); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("[ %d, %X ]\n",rkpk_rem,sb_id); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF); +#endif + + // + // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF + // TIME AND SIMD'IZED + // + + // if yx's don't match then we're either issuing a ttpk or + // resetting the accumulator + if (sk_s.hi != yx_prev) + { + // if yx_next.y == yx_last.y then x changed + if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0) + { + // + // if the tile is not square then it's ratio is 1:2 + // +#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 + skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks, + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); +#else + skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); +#endif + + // flush accumulated ttp vector to block/subblock at ttpb_id + skc_accum_flush(smem,bp_elems,pb_id); + +#if 0 + if (get_sub_group_local_id() == 0) + { + printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n", + pb_id, + (yx_prev >> SKC_TTXK_HI_OFFSET_Y), + (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF, + (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF, + (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF); + } +#endif + + // + // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP + // + rkpk_rem -= 1; + + // create the pk + skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id); + + // append pk key to xk buffer + skc_node_v_append_pk(&pk_s, + + &xk_v, + &xk_v_next, + &xk_v_idx, + bp_elems, + + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); + } + else if (rkpk_rem > 0) // we're starting a new tile row + { + skc_accum_reset(smem); + } + } + + // + // append sk key to node_v + // + // if rkpk_rem is 
zero then return from kernel + // + skc_node_v_append_sk(&sk_s, + + &xk_v, + &xk_v_next, + &xk_v_idx, + bp_elems, + + rkpk_rem, + + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); + + // we're done if no more sk keys + if (rkpk_rem == 0) + break; + + // move to new tile + yx_prev = sk_s.hi; + + // scatter tts values into accumulator + skc_accum_scatter(smem,tts_v); + + // replenish sk keys + skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl new file mode 100644 index 0000000000..e622845d9c --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl @@ -0,0 +1,3366 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "raster_builder_cl_12.h" +#include "device_cl_12.h" + +// #define SKC_ARCH_AVX2 +// #define SKC_RASTERIZE_SIMD_USES_SMEM + +#define PRINTF_ENABLE 0 +#define PRINTF_BLOCK_COUNT 0 + +// +// NOTE: +// +// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT +// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE +// +// NOTE: +// +// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP. +// +// + +#if 0 // SKC_ARCH_AVX2 + +// #define SKC_RASTERIZE_SUBGROUP_SIZE 1 +// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3 +// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1 + +// #define SKC_TTXB_WORDS 8 + +// #define SKC_RASTERIZE_FLOAT float8 +// #define SKC_RASTERIZE_UINT uint8 +// #define SKC_RASTERIZE_INT int8 +// #define SKC_RASTERIZE_PREDICATE int8 + +// #define SKC_RASTERIZE_BIN_BLOCK uint16 +// #define SKC_RASTERIZE_BIN uint8 + +// #define SKC_RASTERIZE_POOL uint8 +// #define SKC_RASTERIZE_POOL_SCALE 6 + +// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1 +// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 + +// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8() + +#endif + +// +// SIMT +// + +#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE +#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE +#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1) + +// +// +// + +#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2) +#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE) + +// +// +// + +#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 } +#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 } + +// +// +// + +#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS) +#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS) +#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS) +#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS) +#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT) +#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS) + +// +// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" +// +// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ +// +// Lerp in two fma/mad ops: +// +// t * b + ((-t) * a + a) +// +// Note: OpenCL documents mix() as being implemented as: +// +// a + (b - a) * t +// +// But this may be a native 
instruction on some devices. For example, +// on GEN9 there is an LRP "linear interoplation" opcode but it +// doesn't appear to support half floats. +// +// Feel free to toggle this option and then benchmark and inspect the +// generated code. We really want the double FMA to be generated when +// there isn't support for a LERP/MIX operation. +// + +#if 1 +#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) +#else +#define SKC_LERP(a,b,t) mix(a,b,t) +#endif + +// +// There is no integer MAD in OpenCL with "don't care" overflow +// semantics. +// +// FIXME -- verify if the platform needs explicit MAD operations even +// if a "--fastmath" option is available at compile time. It might +// make sense to explicitly use MAD calls if the platform requires it. +// + +#if 1 +#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c)) +#else +#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c) +#endif + +// +// +// + +#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()) + +// +// +// + +union skc_bp_elem +{ + skc_uint u32; + skc_tagged_block_id_t tag_id; + skc_float coord; +}; + +// +// +// + +struct skc_subgroup_smem +{ + // + // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM ) + struct { + union { + + skc_uint winner; + + struct { + skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; + } aN; + + struct { + SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; + } vN; + }; + } subgroup; +#endif + + // + // work-in-progress TTSB blocks and associated YX keys + // + union { + struct { + // FIXME -- some typedefs are valid here + skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; + skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + } aN; +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + struct { + SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + SKC_RASTERIZE_BIN yx; + SKC_RASTERIZE_BIN id; + SKC_RASTERIZE_BIN count; + } vN; +#endif + } bin; +}; + +// +// +// + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) +#define skc_subgroup_lane() 0 +#else +#define skc_subgroup_lane() get_sub_group_local_id() +#endif + +// +// replenish block ids +// +// note that you can't overrun the block id pool since it's a ring +// + +static +void +skc_blocks_replenish(skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // + // get a new vector of block ids -- this is kind of a narrow + // allocation but subblocks help stretch out the pool. 
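//
// A minimal host-side C check of the SKC_LERP identity above:
// t * b + ((-t) * a + a) == a + (b - a) * t, mapped onto two fused
// multiply-adds.  The two forms can differ in the last bits; the
// printout below simply compares them side by side.
//

#include <math.h>
#include <stdio.h>

/* two fused multiply-adds, as in SKC_LERP */
static float
lerp_fma(float const a, float const b, float const t)
{
  return fmaf(t, b, fmaf(-t, a, a));
}

/* the mix()-style formulation */
static float
lerp_mix(float const a, float const b, float const t)
{
  return a + (b - a) * t;
}

int
main(void)
{
  for (float t = 0.0f; t <= 1.0f; t += 0.125f)
    printf("%5.3f : %g %g\n", t, lerp_fma(2.0f, 5.0f, t), lerp_mix(2.0f, 5.0f, t));

  return 0;
}

//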
+ // + // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids + // + skc_uint bp_idx = 0; + + if (skc_subgroup_lane() == 0) + { + bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS, + SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads +#if 0 + printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE); +#endif + } + + bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask; + *blocks = bp_ids[bp_idx]; + *blocks_next = 0; +} + +// +// +// + +static +skc_block_id_t +skc_blocks_get_next(skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // replenish? + if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE) + { + skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); + } + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); + +#else + // + // SIMD + // + skc_block_id_t id = blocks->s0; + + skc_shuffle_down_1(*blocks); + +#endif + + *blocks_next += 1; + + return id; +} + +// +// subblock allocator +// + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + +static +skc_block_id_t +skc_subblocks_get_next(skc_block_id_t * const subblocks, + skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); + } + + skc_block_id_t const sb_id = *subblocks; + + *subblocks += 1; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("= %u\n",sb_id); +#endif + + return sb_id; +} + + +#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks +#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks + +#else + +#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks +#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks + +#endif + +// +// +// + +static +skc_block_id_t +skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(), + skc_uint * const blocks_next, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + skc_uint const new_yx) +{ +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t const new_id = skc_subblocks_get_next(subblocks, + blocks_next, + blocks, + bp_atomics, + bp_mask, + bp_ids); +#else + skc_block_id_t const new_id = skc_blocks_get_next(blocks_next, + blocks, + bp_atomics, + bp_mask, // pow2 modulo mask for block pool ring + bp_ids); +#endif + + if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK)) + { + sk_v->lo = new_id; + sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx; +#if 0 + printf("@ ( %3u, %3u ) %u\n", + (new_yx >> 12) & 0xFFF, + (new_yx ) & 0xFFF, + new_id); +#endif + } + + *sk_v_next += 1; + + if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE) + { + *sk_v_next = 0; + + skc_uint sk_idx = 0; + + if (skc_subgroup_lane() == 0) + { + sk_idx = 
SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE); +#if 0 + printf("+ %u\n",sk_idx); +#endif + } + + sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE ) + if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE) +#endif + { + sk_extent[sk_idx] = *sk_v; +#if 0 + printf("> %u : %v2u\n",sk_idx,*sk_v); +#endif + } + } + + return new_id; +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. + // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 0 + + // -- + // 01 + SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 012 + + // ---- + // 0123 + // 01 + + // ---- + // 0123 + // + SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 0123456 + + // -------- + // 01234567 + // 012345 + + // -------- + // 01234567 + // 0123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w); + SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 0123456789abcde + + // ---------------- + // 0123456789abcdef + // 0123456789abcd + + // ---------------- + // 0123456789abcdef + // 0123456789ab + + // ---------------- + // 0123456789abcdef + // 01234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); + SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); + SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_add(v); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. 
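+  //
+  // (A scalar fallback -- sketched here for reference only, this
+  // kernel does not use it -- would simply walk the vector
+  // components:
+  //
+  //   SKC_RASTERIZE_UINT w = v;
+  //
+  //   for (uint ii=1; ii<SKC_RASTERIZE_VECTOR_SIZE; ii++)
+  //     ((uint *)&w)[ii] += ((uint *)&w)[ii-1];
+  //
+  //   return w;
+  //
+  // see the FIXME below.)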
+ // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 0 + + // -- + // 01 + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 012 + + // ---- + // 0123 + // 01 + + // ---- + // 0123 + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 0123456 + + // -------- + // 01234567 + // 012345 + + // -------- + // 01234567 + // 0123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w); + SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 0123456789abcde + + // ---------------- + // 0123456789abcdef + // 0123456789abcd + + // ---------------- + // 0123456789abcdef + // 0123456789ab + + // ---------------- + // 0123456789abcdef + // 01234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); + SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); + SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_add(v); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. 
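+  //
+  // Note that this max-scan is what skc_scatter_scan_max() further
+  // below builds on: it stores each "source" lane index into a
+  // scratch slot and relies on the inclusive max-scan to spread that
+  // index rightward across the lanes that will work on that source's
+  // segments.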
+ // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 00 max + // -- + // 01 + SKC_RASTERIZE_UINT const w = max(v.s00,v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 0012 + + // ---- + // 0123 + // 0101 + + // ---- + // 0123 + // + SKC_RASTERIZE_UINT const w = max(v.s0012,v); + SKC_RASTERIZE_UINT const x = max(w.s0101,w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 00123456 + + // -------- + // 01234567 + // 01012345 + + // -------- + // 01234567 + // 01230123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_UINT const w = max(v.s00123456,v); + SKC_RASTERIZE_UINT const x = max(w.s01012345,w); + SKC_RASTERIZE_UINT const y = max(x.s01230123,x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 00123456789abcde + + // ---------------- + // 0123456789abcdef + // 010123456789abcd + + // ---------------- + // 0123456789abcdef + // 01230123456789ab + + // ---------------- + // 0123456789abcdef + // 0123456701234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v); + SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w); + SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x); + SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_max(v); + +#endif +} + +// +// +// + +static +float +skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return v.s1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return v.s3; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return v.s7; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return v.sf; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return v.s1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return v.s3; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return v.s7; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return v.sf; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); + +#endif +} + +// +// +// + +static +float +skc_subgroup_first(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#else + return v.s0; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,0); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v, + SKC_RASTERIZE_UINT const i) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#else + return shuffle(v,i); +#endif + +#else + // + // SIMT + // + return intel_sub_group_shuffle(v,i); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // 
previous + SKC_RASTERIZE_FLOAT const c) // current +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // FIXME -- there are alternative formulations here: + // + // Option 1: + // + // select(c.rotate(+1),p.rotate(-1),(1,0,0,...)) + // + // Option 2: + // + // p is a scalar + // t = c.rotate(+1) + // t.s0 = p; + // + // Option 3: ... + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return p; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return shuffle2(p,c,(uint2)(1,2)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return shuffle2(p,c,(uint4)(3,4,5,6)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)); +#endif + +#else + // + // SIMT + // + return intel_sub_group_shuffle_up(p,c,1); + +#endif +} + +// +// +// + +static +bool +skc_is_lane_first() +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) + // + // SIMD + // + return true; +#else + // + // SIMT + // + return get_sub_group_local_id() == 0; +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_delta_offset() +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return 1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ); +#endif + +#else + // + // SIMT + // + return 1.0f + get_sub_group_local_id(); + +#endif + +} + +// +// +// + +static +int +skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + return any(p); +#else + // + // SIMT + // + return sub_group_any(p); +#endif +} + +// +// +// + +#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK) + +void +skc_segment_next(__global union skc_bp_elem * const bp_elems, + skc_uint * const nodeword, + skc_block_id_t * const id) +{ + if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword)) + { + *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS; + } + + skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id; + + *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + } +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y) +{ + return native_sqrt(x * x + y * y); +} + +// +// Wang's Formula (1985) +// + +#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned + +#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32) + +#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON)) +#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON)) + +#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y) +#define SKC_WANG_SQRT(x) native_sqrt(x) + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, + SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, + SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y, + SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y) +{ + // + // Return 
the number of evenly spaced (in the parametric sense) line + // segments that are guaranteed to be within "epsilon" error of the + // curve. + // + // We're then going to take multiples of the reciprocal of this + // number so that the segmentation can be distributed across the + // subgroup. + // + // Note, this can probably be slightly optimized per architecture + // but it's probably far from being a hotspot since it's all + // straight-line unpredicated code. + // + // The result is an integer ranging from [1.0,#segments] + // + // Note that even if all of the control points are coincident, the + // max(1.0f) will categorize this as a line of 1 segment. + // + // This is what we want! We want to convert cubics to lines as + // easily as possible and *then* cull lines that are either + // horizontal or zero length. + // + return max(1.0f, + ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC * + SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x), + fabs(t3x - 2.0f * t2x + t1x)), + max(fabs(t2y - 2.0f * t1y + t0y), + fabs(t3y - 2.0f * t2y + t1y)))))); +} + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, + SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, + SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y) +{ + return max(1.0f, + ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * + SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x), + fabs(t2y - 2.0f * t1y + t0y))))); +} + +// +// rational curves +// + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_cubic_rat() +{ + return 0.0f; +} + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_quad_rat() +{ + return 0.0f; +} + +// +// flush any work-in-progress blocks and return unused block ids +// + +static +void +skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_v_t * const blocks, + skc_uint const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem) +{ + // + // flush non-empty bins + // + // FIXME -- accelerate this iteration/search with a subgroup operation + // + for (skc_uint ii=0; iibin.aN.count[ii] > 0) + { + skc_block_id_v_t const id = smem->bin.aN.id[ii]; + skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + skc_uint const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()]; +#if 0 + printf("???????? 
: [ %10u = %10u : %08X ]\n",id,idx,tts); +#endif + bp_elems[idx].u32 = tts; + } + + // + // FIXME -- vectorize with vstoreN() + // + } + + // + // return remaining block ids back to the pool + // + skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next; + + if (blocks_rem > 0) + { + skc_uint bp_idx = 0; + + if (skc_subgroup_lane() == 0) + { + bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem); + +#if 0 + printf("r-: %8u + %u\n",bp_idx,blocks_rem); +#endif + } + + bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask; + + if (skc_subgroup_lane() >= blocks_next) + { + bp_ids[bp_idx] = *blocks; + } + } + + // + // flush work-in-progress ryx keys + // + if (sk_v_next > 0) + { + skc_uint sk_idx = 0; + + if (skc_subgroup_lane() == 0) + { + sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next); +#if 0 + printf("* %u\n",sk_idx); +#endif + } + + sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); + + if (skc_subgroup_lane() < sk_v_next) + { + sk_extent[sk_idx] = *sk_v; + } + } +} + +// +// If there are lanes that were unable to append to a bin because +// their hashes collided with a bin's current ryx key then those bins +// must be ejected. +// +// Note that we do not eject "full" bins because lazily waiting for a +// collision results in simpler code. +// + +static +void +skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_t * const subblocks, + skc_block_id_v_t * const blocks, + skc_uint * const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_UINT const hash, + SKC_RASTERIZE_UINT const yx, + SKC_RASTERIZE_PREDICATE is_collision) // pass by value +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + + // + // FIXME -- this code is now stale with the changes to the + // subblock/block allocation strategy + // + + // + // get local TTSB ID queue count + // + skc_uint ttsb_id_count = smem->pool.count; // scalar + + // init hash bit mask + skc_uint component_mask = 0; + + for (int cc=0; ccbin.aN.count[winner] > 0) + { + skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + + bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; + } + + // + // ensure there is at least one TTSK and TTSB ID + // + if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE) + { + // + // update remaining count + // + ttsb_id_count = 0; + + // + // flush accumulated ttsk_ryx keys + // + uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count + +#if 0 + printf("# %u\n",idx); +#endif + + for (uint ii=0; iipool.aN.id[ii] = bp_ids[id + ii]; + } + + // + // invalidate the winning block + // + + // + // update bin with winning yx, new ttsb id and zero count + // + // all lanes are loading/storing from/to the same index + // + smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID ); + smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count]; + smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc]; + smem->bin.aN.count[winner] = 0; + + // + // update count + // 
+ ttsb_id_count += 1; + } + + // + // save count + // + smem->pool.count = ttsb_id_count; + +#else + // + // SIMT + // + + do { + // + // only one lane will win! + // + if (is_collision) + smem->subgroup.winner = hash; + + barrier(CLK_LOCAL_MEM_FENCE); + + // + // which bin is being ejected? + // + skc_uint const winner = smem->subgroup.winner; + + // + // which colliding hash is taking over the bin? + // + SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner); + + // + // all lanes with the same hash will try to store but only one + // lane will win + // + if (is_winner) + smem->subgroup.winner = yx; + + barrier(CLK_LOCAL_MEM_FENCE); + + // + // flush this block to the pool + // + if (smem->bin.aN.count[winner] > 0) + { + skc_block_id_v_t const id = smem->bin.aN.id[winner]; + skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; +#if 0 + printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts); +#endif + bp_elems[idx].u32 = tts; + } + + // + // append new ttsk + // + skc_uint const new_yx = smem->subgroup.winner; + skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(), + blocks_next, + bp_atomics, + bp_mask, // pow2 modulo mask for block pool ring + bp_ids, + cohort_atomics, + sk_v, + sk_v_next, + sk_extent, + new_yx); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf(">>> %9u\n",new_id); + } +#endif + + // + // update bin with winning yx, new ttsb id and zero count + // + smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID; + smem->bin.aN.yx [winner] = new_yx; + smem->bin.aN.id [winner] = new_id; + smem->bin.aN.count[winner] = 0; + + // + // remove all lanes matching this hash + // + is_collision = is_collision && !is_winner; + + // + // exit if nothing left to do + // + } while (sub_group_any(is_collision)); + +#endif +} + +// +// scatter scan max +// +static +SKC_RASTERIZE_UINT +skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_FLOAT const iss, + SKC_RASTERIZE_FLOAT const ess) +{ + // + // prefix sums determine which lanes we're going to work on next + // + SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP); + SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f)); + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#ifdef SKC_RASTERIZE_SIMD_USES_SMEM + // + // SIMD APPROACH 1: SIMT'ISH + // + + // zero the volatile smem scratchpad using vector syntax + smem->subgroup.vN.scratch[0] = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_scratch_store C) \ + smem->subgroup.aN.scratch[scratch_idx C] = I; + + SKC_RASTERIZE_VECTOR_EXPAND(); + + // propagate lanes to right using max scan + SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0]; + SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); + +#else + // + // SIMD APPROACH 2: SCALAR'ISH + // + + SKC_RASTERIZE_UINT source = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_scratch_store C) \ + ((uint *)&source)[scratch_idx C] = I; + + SKC_RASTERIZE_VECTOR_EXPAND(); + + for (uint ii=1; iisubgroup.vN.scratch[skc_subgroup_lane()] = ( 0 ); + + // + // store source lane at starting lane + // + if (is_scratch_store) + smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane(); + + // + // propagate lanes to right using max scan + // + SKC_RASTERIZE_UINT const 
scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()]; + SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); +#endif + + return source; +} + +// +// sliver lines into subpixels +// + +static +void +skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_t * const subblocks, + skc_block_id_v_t * const blocks, + skc_uint * const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_FLOAT const l0x, + SKC_RASTERIZE_FLOAT const l0y, + SKC_RASTERIZE_FLOAT const l1x, + SKC_RASTERIZE_FLOAT const l1y) +{ + // + // Y-SLIVERING + // ----------- + // + // immediately sliver all multi-pixel lines in into 1-pixel high + // lines + // + // note this implicitly squelches horizontal lines + // + // there is another test for horizontal lines after x-slivering + // is complete + // + + // + // will we need to flip the sign of y_delta ? + // + SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y); + SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000; + + // + // save 1/dy + // + SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y); + + // + // how many non-horizontal subpixel y-axis slivers are there? + // + SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max; + SKC_RASTERIZE_FLOAT y_segs = y_max - y_min; + + // + // inclusive subgroup scan of y_segs + // + SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs); + SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs; + float y_rem = skc_subgroup_last_float(y_iss); + + // + // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails + // + if (y_segs == 0.0f) + y_iss = 0.0f; + +#if 0 + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem); +#endif + + // + // these values don't matter on first iteration + // + SKC_RASTERIZE_FLOAT n1x_prev = 0; + SKC_RASTERIZE_FLOAT n1y_prev = 0; + + // + // loop until done + // + while (y_rem > 0.0f) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess); + + // + // get line at y_source line + // + SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source); + SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source); + SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source); + SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source); + + // + // every lane will create a 1 pixel tall line "sliver" + // + // FIXME -- this gets expanded on SIMD + // + // if numerator == 1 then this is the first lane + // if numerator == s then this is the last lane + // + SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source); + SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source); + + SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count); + + // toggle y_delta sign + SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source))); 
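+
+      //
+      // the XOR above is a branchless sign flip: dy_xor is 0x80000000
+      // only when the source line runs from larger to smaller y, so
+      // y_offset is effectively (y_lt ? +y_delta : -y_delta) for that
+      // source lane
+      //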
+ + // + // calculate "right" line segment endpoint + // + SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP; + SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source); + SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t)); + + // + // override c1 if this is last point + // + n1y = select(n1y,m1y,is_y_last); + n1x = select(n1x,m1x,is_y_last); + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y); + SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x); + + // + // override shuffle up if this is the first line segment + // + n0y = select(n0y,m0y,is_y_first); + n0x = select(n0x,m0x,is_y_first); + + // + // save previous right endpoint + // + n1x_prev = n1x; + n1y_prev = n1y; + + // + // decrement by subgroup size + // + y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + +#if 0 + // + // debug + // + if (n0y != n1y) { + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y); + } +#endif + + // + // X-SLIVERING + // ----------- + // + // now sliver 1-pixel high lines into at either vertical or + // 1-pixel wide lines + // + // save original direction and work with increasing x + // + SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x); + SKC_RASTERIZE_UINT const dx_xor = x_lt ? 0 : 0x80000000; + + // + // save 1/dy + // + SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x); + + // + // how many non-horizontal subpixel y-axis slivers are there? + // + SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const x_base = x_lt ? 
x_min : x_max; + SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f); + + // + // inclusive subgroup scan of y_segs + // + SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs); + SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs; + float x_rem = skc_subgroup_last_float(x_iss); + + // + // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails + // + //if (x_segs == 0.0f) + // x_iss = 0.0f; + + // + // these values don't matter on first iteration + // + SKC_RASTERIZE_FLOAT p1x_prev = 0; + SKC_RASTERIZE_FLOAT p1y_prev = 0; + + // + // loop until done + // + while (x_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess); + + // + // get line at y_source line + // + SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source); + SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source); + SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source); + SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source); + + // + // every lane will create a 1 pixel tall line "sliver" + // + // FIXME -- this gets expanded on SIMD + // + // if numerator == 1 then this is the first lane + // if numerator == s then this is the last lane + // + SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source); + SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source); + + SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count); + + // toggle x_delta sign + SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source))); + + // + // calculate "right" line segment endpoint + // + SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP; + SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source); + SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t)); + + // + // override c1 if this is last point + // + p1x = select(p1x,o1x,is_x_last); + p1y = select(p1y,o1y,is_x_last); + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x); + SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y); + + // + // override shuffle up if this is the first line segment + // + p0x = select(p0x,o0x,is_x_first); + p0y = select(p0y,o0y,is_x_first); + + // + // save previous right endpoint + // + p1x_prev = p1x; + p1y_prev = p1y; + + // + // decrement by subgroup size + // + x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // only non-horizontal subpixel lines are valid + // + SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y); + + // + // if no lanes are active then continue + // + // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY + // IMPACTS PERFORMANCE (+12% ?) + // + // IT SHOULDN'T !!! + // +#if 0 + if (!skc_subgroup_any(is_active)) + continue; +#endif + + // + // Option 1: use SLM for explicitly managed coalesced stores + // + // 1. which tile does this line belong? + // 2. hash tile coordinates + // 3. lookup hash + // 4. if tile matches then SLM append keys + // 5. if tile doesn't match + // a. flush + // b. create new TTSK_RYX + // c. 
obtain TTSB block from pool + // d. goto 3. + // + + // + // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores + // + // 1. which tile does this line belong? + // 2. hash tile coordinates + // 3. lookup hash + // 4. if tile matches then GMEM append keys + // 5. if tile doesn't match + // a. flush (and invalidate empty elems) + // b. create new TTSK_RYX + // c. obtain TTSB block from pool + // d. goto 3. + // + + // + // The virtual rasterization surface is very large and + // signed: +/- ~64K-256K, depending on the architecture. + // + // Rasters must be clipped to the virtual surface and, + // optionally, clipped even further on a per raster + // basis. + // + + // + // Clip to the per-raster clip + // + + /* + + CLIP HERE + + */ + + // + // Hash the tile coordinates + // + // This table lists nominal values for each architecture. + // We want to choose values that are naturally fit the + // "width" of the architecture. + // + // SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS + // ---- ------- ---- --------- -------- --------- + // 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ? + // 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX* + // 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,? + // 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN* + // 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon + // + // NOTE: When possible, bias the hash toward using more y + // bits because of: + // + // 1. the 90 degree counter-clockwise rotation that we put + // in place to offset the render-time clockwise + // rotation + // + // 2. the likely presence of left-to-right or + // right-to-left glyphs. + // + // For power-of-two bins, the hash is easy. + // + // For non-power-of-two, we may want to either implement a + // fast mod (compiler should do this for us... hahahaha) or + // drop down to the next power-of-two. 
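+          //
+          // With power-of-two bins the hash computed further below is
+          // just masking and shifting.  For example, the 3-bit split
+          // sketched in the disabled AVX2 snippet at the top of this
+          // file (Y_BITS = 2, X_BITS = 1) reduces to:
+          //
+          //   hash = ((tile_y & 3) << 1) | (tile_x & 1)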
+ // + + // + // FIXME -- this snarl is not good -- can probably reduce + // some of the sign casting but some is there to vectorize a + // scalar + // + SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y); + SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y); + + SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x); + SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x); + + SKC_RASTERIZE_INT const min_y = min(z0y,z1y); + SKC_RASTERIZE_INT const max_y = max(z0y,z1y); + + SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2; + + SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y; + SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y); + + // + // map [+1,+32] to [ 0,+31] + // map [-1,-32] to [-1,-32] + // + SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26; + + SKC_RASTERIZE_INT const min_x = min(z0x,z1x); + SKC_RASTERIZE_INT const max_x = max(z0x,z1x); + SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2; + + SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X; + SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x); + + SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx; + + SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) | + (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK)); + + SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF)); + +#if 0 + printf("(%3u, %3u)\n",tile_y,tile_x); +#endif + +#if 0 + if (is_active) + printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx); +#endif + + // + // debug + // +#if 0 // PRINTF_ENABLE + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_active C) \ + printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C); + + SKC_RASTERIZE_VECTOR_EXPAND(); +#else + if (is_active) + printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash); +#endif + +#endif + // + // flush all active lanes + // + while (true) + { + // + // either gather load or vector load+shuffle the yx keys + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx; + SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash); +#else + SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash]; +#endif + + // + // does yx for lane match yx for hash? + // + SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID; + SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx); + + // + // OpenCL spec: "When casting a bool to a vector integer + // data type, the vector components will be set to -1 + // (i.e. all bits set) if the vector bool value is true + // and 0 otherwise. + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match; +#else + SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0} +#endif + // + // how many new elements for each matching hash bin? 
+ // + SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS; + SKC_RASTERIZE_UINT const h = h_match << h_shl; + + // + // prefix sum all of the bins in parallel + // + SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h); + SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss); + + // + // current bin counts + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count; + SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash); +#else + SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash]; +#endif + + // + // calculate where each cache-hit and in-bounds tts should be stored + // + SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1; + SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur; + + // + // which lanes can append to a matching bin? + // + SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS); + + // + // scatter append tts elements to bin blocks + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) + // + // SIMD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_append C) \ + { \ + smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \ + smem->bin.aN.count[hash C] = count_new C; \ + } + + SKC_RASTERIZE_VECTOR_EXPAND(); +#else + // + // SIMT + // + if (is_append) + { + smem->bin.aN.ttsb [hash][ttsb_index] = tts; + smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS + } +#endif + // + // try to keep predicate updates SIMD-friendly and + // outside of predicated code paths -- this is not + // always how we would normally do things on SIMT but + // either approach is acceptable + // + + // + // mask off lanes/components that successfully appended + // + is_active = is_active && !is_append; + + // + // are there any active lanes left? + // + if (!skc_subgroup_any(is_active)) + break; + + // + // There are active lanes that couldn't be appended to a + // bin because their hashes collided with the bin's + // current ryx key then those bins must be ejected. + // + // Note that we do not eject "full" bins because lazily + // waiting for a collision results in simpler code. + // + skc_flush(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + subblocks, + blocks, + blocks_next, + sk_v, + sk_v_next, + sk_extent, + smem, + hash, + yx, + is_active); + } + } + } +} + +// +// INITIALIZE SMEM +// +// Note that SIMD/SIMT have nearly the same syntax. 
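+//
+// Note also why the yx seed is SKC_RASTERIZE_YX_INIT (0x7FFF7FFF)
+// rather than zero: packed tile keys occupy at most 24 bits and
+// inactive lanes present SKC_RASTERIZE_YX_INVALID (0x80008000), so
+// neither can ever match a freshly initialized bin -- the first
+// append to any bin is therefore forced through the flush path,
+// which assigns the bin a real yx key and a TTSB block id.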
+// +static +void +skc_smem_init(__local struct skc_subgroup_smem volatile * const smem) +{ + // + // initialize smem bins + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT ); + smem->bin.vN.count = ( 0 ); +#else + // + // SIMT + // + int idx = skc_subgroup_lane(); + +#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) + if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT) +#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) + for (; idxbin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT ); + smem->bin.aN.count[idx] = ( 0 ); + } +#endif +} + +// +// RASTERIZE CUBIC KERNEL +// + +static +void +skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + // + // FIXME -- transformation is currently affine-only support perspective later + // + // the affine transformation requires 8 FMA + 2 ROUND operations + // + SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx + c2y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx + c3y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy + tv->ty); + + // + // + // +#if PRINTF_ENABLE + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \ + " { %.02f, %.02f }, { %.02f, %.02f } },\n", \ + b0x C,b0y C,t1x C,t1y C, \ + t2x C,t2y C,t3x C,t3y C); + + SKC_RASTERIZE_VECTOR_EXPAND(); + +#else + + printf("{ { %.02f, %.02f }, { 
%.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n", + b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); + +#endif + +#endif + + // + // OLD APPROACH + // ------------ + // + // The Spinel CUDA rasterizer was significantly more complex and + // performed a few different tasks that are probably best kept + // separate. + // + // The Spinel rasterizer Bezier held 4-element x and y coordinates + // in adjacent lanes. This simplified intermingling of single lane + // 4-coordinate line segments with two-lane cubic Beziers. + // + // After transformation of the input segments, the Spinel rasterizer + // would test cubics for flatness and, if flat, collapse the + // adjacent lanes into a single line lane and an empty lane. + // + // Any lines would then be appended to a line queue. + // + // Any cubics would then be subdivided. + // + // The reclassification process would be repeated. + // + // NEW APPROACH + // ------------ + // + // Assume we're only working with cubics in this kernel. + // + // Optimization: if the line segment is a special case -- a cusp, + // has 1+ inflections, or a loop -- it might be beneficial to + // subdivide the control cage 1+ times in order to separate the + // flatter segments the high-velocity region(s). + // + // This means we want to split using [a,b] formulation to _directly_ + // subdivide producing a new control cage. + // + // Wang's Formula is still useful even if we subdivide once or twice + // as it's so cheap that it might give some useful hints about where + // the high-velocity sections of curve reside. + // + // But it seems like using Wang's and directly flattening to line + // segments without any subdivision is good enough for the limited + // set of test cases that I've tried. + // + // So... use Wang's Formula to estimate how many line segment are + // required to properly flatten the cubics. + // + // Then use inclusive/exclusive scans to put all the lanes to work: + // + // 1. segmenting cubics to line segments + // + // 2. slivering line segments into 1-pixel high line segments + // + // 3. slivering 1-pixel high line segments into 1-pixel wide line + // segments + // + // MORE BACKGROUND ON NEW APPROACH + // ------------------------------- + // + // Two options for handling line segments: + // + // 1. append the line segments onto an SLM array until enough + // work has been accrued (Spinel does this) + // + // 2. immediately sliver the potentially multi-pixel line + // segments into subpixel lines + // + // The advantage of (1) is that it guarantees the slivering + // process will, on average, always be emitting a full subgroup + // of subpixel lines. + // + // The advantage of (2) is that it reduces code complexity and + // leaves more room for SLM tile bins. The difference between Spinel + // and Skia Compute is that Wang's Formula guarantees there will be + // a full subgroup of multi-pixel lines unless this is the final + // iteration of the warp of multi-pixel lines. + // + // Note that wider GPU architectures might benefit from (1) and + // other work accumulation strategies because it will minimize + // partial warp workloads in the final iteration of each stage. It + // also minimizes the sunk cost of the uniform control logic steps. + // + // So let's implement (2) for now... + // + + // + // And... begin! + // + // Estimate how many line segments are in quad/cubic curve. + // + // Wang's Formula will return zero if the control points are + // collinear but we bump it up to 1.0f. 
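+  //
+  // In closed form, with d the curve degree and eps = SKC_WANG_EPSILON,
+  // the bound is:
+  //
+  //   segs = ceil( sqrt( (d*(d-1)) / (8*eps) * L ) )
+  //
+  // where L is the length of the componentwise maximum of the
+  // control-point second differences -- SKC_WANG_CUBIC and
+  // SKC_WANG_QUADRATIC above are exactly the d = 3 and d = 2
+  // constants.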
+ // + SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); + + // + // if there are free registers then precalculate the reciprocal for + // each estimated segments since it will never change + // + SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); + + + // + // inclusive add scan of estimated line segments + // exclusive add scan of estimated line segments + // total number of estimated line segments + // + SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); + SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; + float s_rem = skc_subgroup_last_float(s_iss); // scalar + + // + // Precompute cubic polynomial coefficients from transformed control + // cage so we can shuffle them in on each iteration of the outer + // loop and then evaluate the polynomial in Horner form. + // + // | 1 0 0 0 | | c0 | + // | | | | + // | -3 3 0 0 | | c1 | + // B(t) = [ 1 t^1 t^2 t^3 ] | | | | + // | 3 -6 3 0 | | c2 | + // | | | | + // | -1 3 -3 1 | | c3 | + // + // + SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL + SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL + + SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL + SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL + + SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB + SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB + + // + // these values don't matter on the first iteration + // + SKC_RASTERIZE_FLOAT l1x_prev = 0; + SKC_RASTERIZE_FLOAT l1y_prev = 0; + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // loop until done + // + while (s_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); + + // + // every lane has a fraction to work off of + // + // FIXME -- this gets expanded on SIMD + // + // if delta == 1 then this is the first lane + // if count == s_segs then this is the last lane + // + SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); + SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); + + SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); + + // + // init parametric t + // + SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? + + // + // if last then override to a hard 1.0f + // + s_t = is_s_last ? 
1.0f : s_t; + + // + // decrement by subgroup size + // + s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // now every lane knows what to do and the following lines will + // pump out up to SUBGROUP_SIZE line segments + // + // obtain the src vertices through shared or via a shuffle + // + + // + // shuffle in the polynomial coefficients their source lane + // + SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); + SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); + + SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); + SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); + + SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); + SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); + + SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source); + SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source); + + // + // calculate "right" line segment endpoint using Horner form + // + SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND + SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); + SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); + + // + // save previous right endpoint + // + l1x_prev = l1x; + l1y_prev = l1y; + + // + // override shuffle up if this is the first line segment + // + l0x = select(l0x,s0x,is_s_first); + l0y = select(l0y,s0y,is_s_first); + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + } + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// RASTERIZE QUAD KERNEL +// + +static +void +skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + 
skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + // + // FIXME -- transformation is currently affine-only support perspective later + // + // the affine transformation requires 8 FMA + 2 ROUND operations + // + SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx + c2y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy + tv->ty); + + // + // Estimate how many line segments are in quad/cubic curve. + // + // Wang's Formula will return zero if the control points are + // collinear but we bump it up to 1.0f. + // + SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y); + + // + // if there are free registers then precalculate the reciprocal for + // each estimated segments since it will never change + // + SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); + + + // + // inclusive add scan of estimated line segments + // exclusive add scan of estimated line segments + // total number of estimated line segments + // + SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); + SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; + float s_rem = skc_subgroup_last_float(s_iss); // scalar + + // + // Precompute quadratic polynomial coefficients from control cage so + // we can shuffle them in on each iteration of the outer loop and + // then evaluate the polynomial in Horner form. 
+ // + + // | 1 0 0 | | c0 | + // | | | | + // B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 | + // | | | | + // | 1 -2 1 | | c2 | + // + // + SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL + SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL + + SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD + SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD + + // + // these values don't matter on the first iteration + // + SKC_RASTERIZE_FLOAT l1x_prev = 0; + SKC_RASTERIZE_FLOAT l1y_prev = 0; + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // loop until done + // + while (s_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); + + // + // every lane has a fraction to work off of + // + // FIXME -- this gets expanded on SIMD + // + // if delta == 1 then this is the first lane + // if count == s_segs then this is the last lane + // + SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); + SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); + + SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); + + // + // init parametric t + // + SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? + + // + // if last then override to a hard 1.0f + // + s_t = is_s_last ? 
1.0f : s_t; + + // + // decrement by subgroup size + // + s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // now every lane knows what to do and the following lines will + // pump out up to SUBGROUP_SIZE line segments + // + // obtain the src vertices through shared or via a shuffle + // + + // + // shuffle in the polynomial coefficients their source lane + // + SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); + SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); + + SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); + SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); + + SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); + SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); + + // + // calculate "right" line segment endpoint using Horner form + // + SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND + SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); + SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); + + // + // save previous right endpoint + // + l1x_prev = l1x; + l1y_prev = l1y; + + // + // override shuffle up if this is the first line segment + // + l0x = select(l0x,s0x,is_s_first); + l0y = select(l0y,s0y,is_s_first); + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + } + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// RASTERIZE LINE KERNEL +// + +static +void +skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + +#if 0 + // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y); + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y); +#endif + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + 
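+  //
+  // rough example (hypothetical values): with sx = sy = 32 and no
+  // shear or translation, the endpoint (10.25, 3.5) snaps to the
+  // subpixel grid at (round(328.0), round(112.0)) = (328, 112)
+  //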
// + // FIXME -- transformation is currently affine-only + // FIXME -- support perspective later + // + // the affine transformation requires 8 FMA + 4 ROUND operations + // + SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx + c1y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy + tv->ty); + +#if 0 + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y); +#endif + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. 
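+  //
+  // e.g. (hypothetical sizes): with 4 subgroups per workgroup, the
+  // subgroup with get_group_id(0) == 3 and get_sub_group_id() == 2
+  // would process cmd_idx = 3 * 4 + 2 = 14
+  //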
+ // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("+cmd_idx = %u\n",cmd_idx); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("-cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("[ %u ]< %u, %u, %u, %u >\n", + cmd_idx, + cmd.nodeword, + SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd), + SKC_CMD_RASTERIZE_GET_CLIP(cmd), + SKC_CMD_RASTERIZE_GET_COHORT(cmd)); +#endif + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id); + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
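+  //
+  // e.g. a host-side identity transform scaled to a hypothetical
+  // 32x32 subpixel resolution would be loaded as:
+  //
+  //   { 32.0f, 0.0f, 0.0f, 0.0f, 32.0f, 0.0f, 0.0f, 0.0f }
+  //      sx    shx   tx    shy   sy     ty    w0    w1
+  //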
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + switch (tag) + { + case SKC_BLOCK_ID_TAG_PATH_LINE: + skc_rasterize_lines(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_QUAD: + skc_rasterize_quads(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_CUBIC: + skc_rasterize_cubics(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD: + break; + case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: + break; + + default: + break; + } +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. 
+ // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. + // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_lines(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
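+  //
+  // note: every lane in the subgroup indexes the same command, so the
+  // float8/float4 loads below are effectively uniform broadcast loads
+  //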
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_quads(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_cubics(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + ; +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + ; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl new file mode 100644 index 0000000000..0c7da7d0ad --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl @@ -0,0 +1,144 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "raster.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "raster_builder_cl_12.h" +#include "device_cl_12.h" + +// +// There is a fixed-size meta table per raster cohort that we use to +// peform a mostly coalesced sizing and allocation of blocks. +// +// This code is simple and fast. 
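+//
+// A rough sketch of the sizing below (hypothetical sizes): with an
+// 8-dword raster head, 64-dword nodes and 100 keys to store,
+//
+//   hn = (8 + 100 + 64 - 2) / (64 - 1) = 2   // ceil((head + keys) / (node - 1))
+//
+// so a single node trails the head. Blocks are then acquired with one
+// atomic add per group after a group-wide prefix sum.
+//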
+// + +__kernel +SKC_RASTERS_ALLOC_KERNEL_ATTRIBS +void +skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t * const map, + __global skc_uint * const metas, + __global skc_uint const * const raster_ids, // FIXME -- CONSTANT + skc_uint const count) +{ + // access to the meta extent is linear + skc_uint const gid = get_global_id(0); + skc_bool const is_active = gid < count; + + // + // init with defaults for all lanes + // + union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } }; + skc_uint raster_id = SKC_UINT_MAX; + skc_uint extra_blocks = 0; + + if (is_active) + { + // load meta_in + meta.in.u32v4 = vload4(gid,metas); + + // load raster_id as early as possible + raster_id = raster_ids[gid]; + +#if 0 + printf("%3u + %5u, %5u, %5u, %5u\n", + gid, + meta.in.blocks, + meta.in.offset, + meta.in.pk, + meta.in.rk); +#endif + + // how many blocks will the ttpb blocks consume? + extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) / + SKC_DEVICE_SUBBLOCKS_PER_BLOCK); + + // total keys + meta.out.keys += meta.in.pk; + + // how many blocks do we need to store the keys in the head and trailing nodes? + skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) / + (SKC_RASTER_NODE_DWORDS - 1)); + // increment blocks + extra_blocks += hn; + + // how many nodes trail the head? + meta.out.nodes = hn - 1; + + // update blocks + meta.out.blocks += extra_blocks; + +#if 0 + printf("%3u - %5u, %5u, %5u, %5u\n", + gid, + meta.out.blocks, + meta.out.offset, + meta.out.nodes, + meta.out.keys); +#endif + } + + // + // allocate blocks from block pool + // + // first perform a prefix sum on the subgroup to reduce atomic + // operation traffic + // + // note this idiom can be implemented with vectors, subgroups or + // workgroups + // + + skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks); + skc_uint reads = 0; + + // last lane performs the block pool allocation with an atomic increment + if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) { + reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads + } + + // broadcast block pool base to all lanes + reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1); + + // update base for each lane + reads += prefix - extra_blocks; + + // + // store meta header + // + if (is_active) + { + // store headers back to meta extent + vstore4(meta.out.u32v4,gid,metas); + + // store reads + metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads; + + // get block_id of each raster head + skc_block_id_t const block_id = bp_ids[reads & bp_mask]; + + // update map + map[raster_id] = block_id; + +#if 0 + printf("alloc: %u / %u\n",raster_id,block_id); +#endif + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl new file mode 100644 index 0000000000..27411cfe96 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl @@ -0,0 +1,442 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "raster.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS) + +#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS) + +// +// +// + +#if ( SKC_RASTERS_RECLAIM_X == 1 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_RASTERS_RECLAIM_X == 2 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_RASTERS_RECLAIM_X == 4 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_RASTERS_RECLAIM_X == 8 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_RASTERS_RECLAIM_X == 16) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_RASTERS_RECLAIM_X" +#endif + +#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \ + (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) + +#endif + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + 
SKC_BROADCAST_LAST_HELPER(E,I) + +// +// COMPILE-TIME PREDICATES +// + +#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \ + SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I) + +#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \ + SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I) + +// +// RUN-TIME PREDICATES +// + +#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \ + (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS) + +// +// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL +// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK +// COMBOS (NOT NECESSARILY POW2) +// +// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR +// UINT TYPE INSTEAD OF A ULONG. +// + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint + +// +// +// + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ + (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ + ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ + S = sub_group_scan_exclusive_add(C) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \ + (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK) + +// +// +// + +struct skc_reclaim +{ + skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE]; +}; + +__kernel +SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS +void +skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring + __global skc_uint * const bp_elems, // block pool blocks + __global skc_uint volatile * const bp_atomics, // read/write atomics + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const map, // raster host-to-device map + struct skc_reclaim const reclaim) // array of host raster ids +{ +#if (__OPENCL_VERSION__ < 200) + skc_uint const reclaim_stride = get_num_sub_groups(); +#else + skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); + +#if 0 + // + // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT + // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL + // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE + // RECLAMATION JOB ON THE REST OF THE PIPELINE. 
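+  //
+  // (if enabled, each subgroup would stride through the reclaim array:
+  //  reclaim_idx, reclaim_idx + reclaim_stride, ... until reaching
+  //  SKC_RECLAIM_ARRAY_SIZE)
+  //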
+ // + for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) +#endif + { + // get host raster id + skc_raster_h const raster = reclaim.aN[reclaim_idx]; + + // get block id of raster header + skc_block_id_t id = map[raster]; + + // + // load all of the head block ttxk.lo keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + // load raster header counts -- we only need the blocks and + // nodes words the keys are doublewords. + // + // FIXME -- this can be made portable with compile-time macro expansion + // + skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES + skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes); + } +#endif + // + // acquire a span in the block pool ids ring for reclaimed ids + // + skc_uint bp_ids_base = 0; + + if (get_sub_group_local_id() == 0) { + bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); + } + + bp_ids_base = sub_group_broadcast(bp_ids_base,0); + + // + // mask off everything but the block id + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + h##I = h##I & SKC_TTXK_LO_MASK_ID; \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; +#if 0 + printf("rasters next = %u\n",id); +#endif + } + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%08X %u\n",h##I,h##I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + printf("%08X\n",h##I); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + + // + // - we'll skip subgroups that are entirely header + // + // - but we need to mark any header elements that partially fill + // a subgroup as subblocks + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \ + if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \ + h##I = SKC_UINT_MAX; \ + } \ + } \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + { + // + // count reclaimable blocks in each lane + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // 
store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = h##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + } + + // printf("R %7u ! %u\n",bp_ids_idx,h##I); + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, walk the nodes + // + do { + // id of next block is in last lane + id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); + + // + // load all of the node block ttxk.lo keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // mask off everything but the block id + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + n##I = n##I & SKC_TTXK_LO_MASK_ID; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; +#if 0 + printf("rasters next = %u\n",id); +#endif + } + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%08X %u\n",n##I,n##I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + + // + // count reclaimable blocks in each lane + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = n##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // printf("R %7u ! %u\n",bp_ids_idx,n##I); + + // any more nodes? + } while (--count_nodes > 0); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/render.cl b/src/compute/skc/platforms/cl_12/kernels/render.cl new file mode 100644 index 0000000000..9205334940 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/render.cl @@ -0,0 +1,2165 @@ +/* + * Copyright 2016 Google Inc. 
+ * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "styling_types.h" +#include "atomic_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1) + +// +// +// + +#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15 +#endif + +// +// tile state flag bits +// + +typedef enum skc_tile_flags_e { + + // FLUSH + SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001, + SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002, + SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004, + + // OPACITY + SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008, + + // + // Note: testing for opacity and skipping scattering is on its way + // to becoming a much more programmable option because sometimes we + // may be compositing/blending from back-to-front and/or be using + // group blend rules that ignore opacity. + // + // The point is that all of these decisions should be encoded in + // styling commands and, as much as possible, removed from the final + // group/layer styling traversal render loop. + // + +} skc_tile_flags_e; + +// +// COVER -- assumes availability of either fp16 or fp32 +// + +union skc_tile_cover +{ + struct { + SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH]; + } aN; + +#ifdef SKC_RENDER_TILE_COVER_VECTOR + struct { + SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT]; + } vN; +#endif +}; + +// +// COLOR -- assumes availability of either fp16 or fp32 +// + +union skc_tile_color +{ + union { + struct { + SKC_RENDER_TILE_COLOR r; + SKC_RENDER_TILE_COLOR g; + SKC_RENDER_TILE_COLOR b; + SKC_RENDER_TILE_COLOR a; + } rgba[SKC_TILE_WIDTH]; + } aN; + +#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED + union { + SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH]; + } iN; +#endif + +#ifdef SKC_RENDER_TILE_COLOR_VECTOR + union { + SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT]; + } vN; +#endif + + struct { + union { + struct { + SKC_RENDER_TILE_COLOR r; + SKC_RENDER_TILE_COLOR g; + }; + SKC_RENDER_GRADIENT_FLOAT distance; + }; + union { + struct { + SKC_RENDER_TILE_COLOR b; + SKC_RENDER_TILE_COLOR a; + }; + SKC_RENDER_GRADIENT_FLOAT stoplerp; + }; + } grad[SKC_TILE_WIDTH]; +}; + +// +// SHARED MEMORY STATE +// + +#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT) + +#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE) +#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA)) + +// +// +// + +union skc_subgroup_smem +{ + // + // The tiles are stored in column-major / height-major order + // + // The final column is a guard column that is OK to write to but + // will never be read. 
It simplifies the TTSB scatter but could be + // predicated if SMEM is really at a premium. + // +#if ( SKC_RENDER_SUBGROUP_SIZE > 1 ) + struct { + SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] + } atomic; +#endif + + struct { + int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] + } aN; + + struct { // assumption is that height = subgroup + SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE]; + } vN; + + struct { // assumption is that height = subgroup + SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE]; + } wide; + + union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; + + half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; + +#if 0 + // + // SPILL TO GMEM + // +#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0) + struct { + +#if (SKC_REGS_COLOR_S > 0) + union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; +#endif + +#if (SKC_REGS_COVER_S > 0) + union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; +#endif + + } regs; +#endif + // + // + // +#endif +}; + +// +// +// + +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + +#define skc_subgroup_lane() 0 + +#else + +#define skc_subgroup_lane() get_sub_group_local_id() + +#endif + +// +// +// + +typedef skc_uint skc_ttsk_lo_t; +typedef skc_uint skc_ttsk_hi_t; + +typedef skc_uint skc_ttpk_lo_t; +typedef skc_uint skc_ttpk_hi_t; + +typedef skc_uint skc_ttxk_lo_t; +typedef skc_uint skc_ttxk_hi_t; + +typedef skc_uint skc_ttck_lo_t; +typedef skc_uint skc_ttck_hi_t; + +typedef skc_uint2 skc_ttck_t; + +typedef skc_int skc_ttxb_t; + +// +// TTCK (32-BIT COMPARE) v1: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 18 | 7 | 7 | +// +// +// TTCK (32-BIT COMPARE) v2: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 15 | 9 | 8 | +// +// +// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 27 | 1 | 1 | 18 | 9 | 8 | +// + +static +skc_uint +skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) +{ + return a & SKC_TTCK_LO_MASK_ID; +} + +static +skc_layer_id +skc_ttck_get_layer(skc_ttck_t const a) +{ + // + // FIXME -- a union with a ulong and a shift down and mask is + // probably faster on some architectures + // + skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); + skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; + + return lo | hi; +} + +static +skc_uint +skc_ttck_hi_get_x(skc_ttck_hi_t const a) +{ + return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); +} + +static +skc_uint +skc_ttck_hi_get_y(skc_ttck_hi_t const a) +{ + return a >> SKC_TTCK_HI_OFFSET_Y; +} + +static +skc_bool +skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b) +{ + skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); + skc_uint const hi = (a.hi ^ b.hi); + + return (lo | hi) == 0; +} + +static +skc_bool +skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) +{ + return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; +} + +static +skc_bool +skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) +{ + return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; +} + +// +// TILE TRACE SUBPIXEL +// +// The subpixels are 
encoded with either absolute tile coordinates +// (32-bits) or packed in delta-encoded form form. +// +// For 32-bit subpixel packing of a 32x32 tile: +// +// A tile X is encoded as: +// +// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate. +// +// SX : 6 : unsigned subpixel span from min to max x with range +// [0,32]. The original direction is not captured. Would +// be nice to capture dx but not necessary right now but +// could be in the future. <--- SPARE VALUES AVAILABLE +// +// A tile Y is encoded as: +// +// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate. +// +// DY : 6 : signed subpixel delta y1-y0. The range of delta is +// [-32,32] but horizontal lines are not encoded so [1,32] +// is mapped to [0,31]. The resulting range [-32,31] fits +// in 6 bits. +// +// TTS: +// +// 0 31 +// | TX | SX | TY | DY | +// +-----+------+-----+------+ +// | 10 | 6 | 10 | 6 | +// + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a) +{ + // + // extract the whole pixel y coordinate + // + return SKC_BFE(a, + SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2, + SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2); +} + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a) +{ + // + // get the linear array tile index of the pixel + // + return (((a & SKC_TTS_MASK_TX_PIXEL) + +#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2) + >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2) +#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2) + << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2) +#endif + + ) | skc_tts_get_ty_pixel_v(a)); +} + +#if 0 +static +skc_ttx_v_s32_t +skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) +{ + skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY; + + return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31)); +} +#else +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) +{ + SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY; + + return dy - (~a >> 31); +} +#endif + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a) +{ + return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2); +} + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_sx_v(SKC_RENDER_TTS_V const a) +{ + return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX); +} + +// +// +// + +static +void +skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem) +{ + // + // SIMD / CPU + // + // & + // + // SIMT / GPU + // + // Note that atomic_init() is likely implemented as a simple + // assignment so there is no identifiable performance difference on + // current targets. + // + // If such an architecture appears in the future then we'll probably + // still want to implement this zero'ing operation as below but + // follow with an appropriate fence that occurs before any scatter + // operations. + // + // The baroque expansion below improves performance on Intel GEN by, + // presumably, achieving the 64-byte per clock SLM write as well as + // minimizing the overall number of SEND() block initializations and + // launches. + // + // Intel GENx has a documented 64 byte per cycle SLM write limit. + // So having each lane in an 8 lane subgroup zero-write 8 bytes is + // probably a safe bet (Later: benchmarking backs this up!). + // + // Note there is no reason at this time to unroll this loop. + // + for (uint ii=0; iiwide.area[ii][skc_subgroup_lane()] = ( 0 ); +} + +// +// Note this is going to be vectorizable on most architectures. 
+// +// The return of the key translation feature might complicate things. +// + +static +void +skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, + __local union skc_subgroup_smem * SKC_RESTRICT const smem, + skc_block_id_t const pb_id) +{ + skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); + +#if ( SKC_TILE_RATIO == 1 ) + + SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset]; + +#elif ( SKC_TILE_RATIO == 2 ) + + SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent); + +#else + +#error("tile ratio greater than 2 not supported") + +#endif + + // + // Note there is no need to use an atomic for this operation on the + // current group of target platforms... but this may change if + // atomic ops truly go through a different path. + // + // As noted above, this direct increment is probably faster and can + // always be followed by a fence. + // + // Furthermore, note that the key sorting orders all ttck keys + // before ttpk keys. + // + + // + // FIXME -- if the SMEM store is wider than bank word count then we + // might want to odd-even interleave the TTP values if the target + // device can't handle 64-bit stores + // + + // + // skipping per-key translation for now + // + smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1); +} + +// +// Note that skc_scatter_ttsb is *not* vectorizable unless the +// architecture supports a "scatter-add" capability. All relevant +// GPUs support atomic add on shared/local memory and thus support +// scatter-add. +// + +static +void +skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, + __local union skc_subgroup_smem * SKC_RESTRICT const smem, + skc_block_id_t const sb_id) +{ + skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + + SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset]; + + // + // Skipping per-key translation for now + // + + // Index into tile + // + // The tiles are stored in column-major / height-major order + // + // The final column is a guard column that is OK to write to but + // will never be read. It simplifies the TTSB scatter but could be + // predicated if SMEM is really at a premium. + // + + SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v); + +#if 0 + if (tts_v != SKC_TTS_INVALID) + printf("(%08X) = %u\n",tts_v,xy_idx); +#endif + + // + // adjust subpixel range to max y + // + // range is stored as [-32,31] and when read [0,31] is mapped to + // [1,32] because a dy of 0 is not possible. + // + // more succinctly: if dy >= 0 then ++dy + // + SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v); + + // + // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid? + // + + // this "min(x0) * 2 + dx" is equivalent to "x0 + x1" + SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v); + + // Calculate left and right coverage contribution trapezoids + SKC_RENDER_TTS_V_BITFIELD const left = dy * widths; + SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left; + + // + // Accumulate altitudes and areas + // + // Optimization: if the device supports an CPU/SIMD vector-add or + // GPU/SIMT scatter-add atomic int2 add operation then placing the + // ALT and AREA values side-by-side would halve the number of + // additions. 
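+  //
+  // worked example (hypothetical 32x32 subpixel tiles): a TTS with
+  // tx = 3, sx = 5, dy = 4 contributes
+  //
+  //   left  = dy * (2*tx + sx)    = 4 * 11   = 44
+  //   right = dy * 2 * 32 - left  = 256 - 44 = 212
+  //
+  // to the two accumulators for its pixel column
+  //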
+ // +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + // + // CPU/SIMD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \ + smem->aN.area[ xy_idx C] += right C; \ + } + +#else + // + // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \ + SKC_TILE_HEIGHT + xy_idx C, \ + left C); \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \ + right C); \ + } +#endif + + SKC_RENDER_TTSB_EXPAND(); +} + +// +// Note that 2048.0 can be represented exactly with fp16... fortuitous! +// + +#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y) +#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA) +#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1) +#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA) + +// +// +// + +static +void +skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + union skc_tile_cover * SKC_RESTRICT const cover, + union skc_tile_color * SKC_RESTRICT const color) +{ + SKC_RENDER_ACC_COVER_INT area = 0; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 + for (uint ii=0; iivN.area[ii][skc_subgroup_lane()]; + SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); + SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA)); + + cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32); + } +} + +static +void +skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + union skc_tile_cover * SKC_RESTRICT const cover, + union skc_tile_color * SKC_RESTRICT const color) +{ + SKC_RENDER_ACC_COVER_INT area = 0; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 + for (uint ii=0; iivN.area[ii][skc_subgroup_lane()]; + SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); + SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA)); + + cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32; + } +} + +// +// +// + +static +void +skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color) +{ + // + // rgba = solid fill + // + __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; + + *cmd_next += 2; + +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) + + SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; iiaN.rgba[ii].r = rg.lo; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; iiaN.rgba[ii].g = rg.hi; + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; iiaN.rgba[ii].b = ba.lo; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; iiaN.rgba[ii].a = ba.hi; + +#else + + SKC_RENDER_TILE_COLOR_PAIR const rg = 
SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + SKC_RENDER_TILE_COLOR const r = rg.lo; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r); + + SKC_RENDER_TILE_COLOR const g = rg.hi; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g); + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + SKC_RENDER_TILE_COLOR const b = ba.lo; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b); + + SKC_RENDER_TILE_COLOR const a = ba.hi; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a); + +#endif +} + +// +// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" +// +// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ +// +// Lerp in two fma/mad ops: +// +// t * b + ((-t) * a + a) +// +// Note: OpenCL documents mix() as being implemented as: +// +// a + (b - a) * t +// +// But this may be a native instruction on some devices. For example, +// on GEN9 there is an LRP "linear interoplation" function but it +// doesn't appear to support half floats. +// + +#if 1 +#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) +#else +#define SKC_LERP(a,b,t) mix(a,b,t) +#endif + +// +// CPUs have a mock local address space so copying the gradient header +// is probably not useful. Just read directly from global. +// + +#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL +#define SKC_RENDER_GRADIENT_SPACE __local +#else +#define SKC_RENDER_GRADIENT_SPACE __global +#endif + +// +// gradient is non-vertical +// +// removed the vertical (actually, horizontal) special case +// + +static +void +skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + __global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // Where is this tile? + // + // Note that the gradient is being sampled from pixel centers. + // + SKC_RENDER_GRADIENT_FLOAT const y = +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P + (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) + + (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE)); + + float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH); + + // + // Get starting numerator and denominator + // + // Note: if gh[0].dx is exactly 0.0f then this is a vertical + // gradient and can be handled by a special opcode. + // + // Note: the mad() ordering is slightly different than the original + // CUDA implementation. + // + union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) }; + + *cmd_next += 4; + + float const gv_x_dot = mad(x,gv.dx,gv.p0); + SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot); + + // + // Where are columns along gradient vector? + // + // TODO: Note that the gv_denom isn't multiplied through. + // + // Please doublecheck this... but I recall that in certain cases + // this wipes out some precision and results in minor but noticeable + // gradient artifacts. 
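+ //
+ // For reference, the per-column evaluation below works out to:
+ //
+ //   distance(ii) = ((x + ii) * gv.dx + y * gv.dy + gv.p0) * gv.denom
+ //
+ // which is (presumably) the pixel center projected onto the
+ // gradient vector and then scaled by gv.denom.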
+ // + // All arguments are scalars except gv_numer so a simpler + // evaluation might save some flops. + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom; + + // + // is gradient non-repeating, repeating or reflecting? + // + switch (commands[(*cmd_next)++].u32) + { + case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING: + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f); + break; + + case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING: + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].distance -= floor(color->grad[ii].distance); + break; + + default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING + // + // OPTIMIZATION: Can this be done in fewer than ~4 ops? + // + // Note: OpenCL "rint()" is round-to-nearest-even integer! + // + // Note: the floor() "round to -inf" op is implemented in the + // GEN op 'FRC' so probably don't use trunc() when floor will + // suffice. + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].distance); + color->grad[ii].distance = fabs(dist_abs - rint(dist_abs)); + } + } + + // + // initialize "stoplerp" for all columns + // + uint const slope_count = commands[(*cmd_next)++].u32; + uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME + + { + float const slope = commands[(*cmd_next)++].f32; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].stoplerp = color->grad[ii].distance * slope; + } + + // + // compute stoplerp for remaining stops + // + for (int jj=1; jjgrad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp); + } + + // + // copy gradient colors to local memory + // + uint const gd_n = slope_count + 1; + +#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL + // + // copy entire gradient descriptor to local memory + // + for (uint ii=skc_subgroup_lane(); iicmds[ii].u32 = commands[*cmd_next + ii].u32; + + __local half const * const SKC_RESTRICT gc = smem->gc + 0; +#else + // + // prefetch entire gradient header + // + // no noticeable impact on performance + // + // prefetch(&commands[*cmd_next].u32,gh_words); + // + __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0; +#endif + + // + // adjust cmd_next so that V1 structure is consumed -- FIXME + // + *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n); + + // + // lerp between color pair stops + // + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].stoplerp); + SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp)); + + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + 
+#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac); + } + } +} + +// +// +// + +static +void +skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // fralunco = cover.wip * acc.a + // + // acc.r = fralunco * wip.r + acc.r + // acc.g = fralunco * wip.g + acc.g + // acc.b = fralunco * wip.b + acc.b + // acc.a = -fralunco * wip.a + acc.a + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] * color_acc->aN.rgba[ii].a; + + color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // cover_min = min(cover.wip,a.acc) + // + // r.acc = cover_min * r.wip + r.acc + // g.acc = cover_min * g.wip + g.acc + // b.acc = cover_min * b.wip + b.acc + // a.acc = -cover_min * a.wip + a.acc + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii],color_acc->aN.rgba[ii].a); + + color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // r.acc = (cover.wip * r.wip) * r.acc + // g.acc = (cover.wip * g.wip) * g.acc + // b.acc = (cover.wip * b.wip) * b.acc + // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha) + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r; + color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g; + color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b; + color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a; + } +} + +// +// +// + +static +void +skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc, + union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover 
const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // cover.wip.contrib = (1.0 - cover.acc) * cover.wip + // cover.acc = cover.acc + cover.wip.contrib + // + // r.acc = cover.wip.contrib * r.wip + r.acc + // g.acc = cover.wip.contrib * g.wip + g.acc + // b.acc = cover.wip.contrib * b.wip + b.acc + // a.acc = -cover.wip.contrib * a.wip * a.acc + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii]) * cover_wip->aN.c[ii]; + + cover_acc->aN.c[ii] += contrib; + + color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk, + union skc_tile_cover const * SKC_RESTRICT const cover_wip) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = cover_wip->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = cover_wip->vN.c[ii]; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk, + union skc_tile_cover const * SKC_RESTRICT const cover_acc) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = cover_acc->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNTN))) + for (uint ii=0; iivN.c[ii] = cover_acc->vN.c[ii]; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip) +{ + // + // cover.wip.contrib = (1.0 - cover.acc) * cover.wip + // cover.acc = cover.acc + cover.wip.contrib + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]); +} + +// +// +// + +static +void +skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip, + union skc_tile_cover const * SKC_RESTRICT const cover_msk) +{ + // + // cover.wip *= cover.msk + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] *= cover_msk->aN.c[ii]; +} + +// +// +// + +static +void +skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = 0; + +#endif +} + +static +void +skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // 
__attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = 0; + +#endif +} + +static +void +skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = 0; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 1; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 1 - cover->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = 1 - cover->vN.c[ii]; + +#endif +} + +// +// +// + +static +void +skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color) +{ +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r = 0; + color->aN.rgba[ii].g = 0; + color->aN.rgba[ii].b = 0; + color->aN.rgba[ii].a = 1; + } + +#else + // + // DISABLED ON GEN9 -- probably a compiler bug + // + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.odd = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.odd = 1; +#endif +} + +static +void +skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color) +{ +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r = 0; + color->aN.rgba[ii].g = 0; + color->aN.rgba[ii].b = 0; + color->aN.rgba[ii].a = 1; + } + +#else + // + // DISABLED ON GEN9 -- probably a compiler bug + // + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.odd = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.odd = 1; +#endif +} + +// +// +// + +static +bool +skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color) +{ + // + // returns true if tile is opaque + // + // various 
hacks to test for complete tile opacity + // + // note that front-to-back currently has alpha at 0.0f -- this can + // be harmonized to use a traditional alpha if we want to support + // rendering in either direction + // + // hack -- ADD/MAX/OR all alphas together and test for non-zero + // + SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=1; iiaN.rgba[ii].a; + +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + return !any(t != ( 0 )); + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) + // + // SIMT - scalar per lane + // + return !sub_group_any(t != 0); + +#else + // + // SIMT - vector per lane + // + return !sub_group_any(any(t != ( 0 ))); + +#endif + + // + // TODO: The alternative vector-per-lane implementation below is + // *not* believed to be performant because the terse vector-wide + // test is just hiding a series of comparisons and is likely worse + // than the blind ADD/MAX/OR'ing of all alphas followed by a single + // test. + // +#if 0 + // + // SIMT - vector per lane + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1))) + for (uint ii=0; iivN.ba[ii].a != ( 0 )))) + return false; + } + + return true; +#endif +} + +// +// +// + +static +void +skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color) +{ + // + // acc.r = acc.a * r + acc.r + // acc.g = acc.a * g + acc.g + // acc.b = acc.a * b + acc.b + // + __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; + + *cmd_next += 2; + + SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g); + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b); +} + +// +// +// + +// #define SKC_SURFACE_IS_BUFFER +#ifdef SKC_SURFACE_IS_BUFFER + +static +void +skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface, + skc_uint const surface_pitch, + union skc_tile_color const * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // NEW MAJOR OPTIMIZATION: + // + // Rotating and rasterizing the original world transform by -90 + // degrees and then rendering the scene scene by +90 degrees enables + // all the final surface composite to be perfomed in perfectly + // coalesced wide transactions. + // + // For this reason, linear access to the framebuffer is preferred. + // + // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv + // + // NOTE THIS IS TRANSPOSED BY 90 DEGREES + // + // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE + // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. 
+ // + // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS + // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS + // + // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL + // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER + // + uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE; + uint const x = skc_ttck_hi_get_x(ttck_hi); + uint const y = skc_ttck_hi_get_y(ttck_hi) ; + uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane(); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r * 255); + rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8; + rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16; + + surface[base + ii * pitch] = rgba; + + // printf("%08v2X\n",rgba); + } +} + +#else + +static +void +skc_surface_composite_u8_rgba(__write_only image2d_t surface, + union skc_tile_color const * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // NEW MAJOR OPTIMIZATION: + // + // Rotating and rasterizing the original world transform by -90 + // degrees and then rendering the scene scene by +90 degrees enables + // all the final surface composite to be perfomed in perfectly + // coalesced wide transactions. + // + // For this reason, linear access to the framebuffer is preferred. + // + // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv + // + // NOTE THIS IS TRANSPOSED BY 90 DEGREES + // + // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE + // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. + // + // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS + // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS + // + // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL + // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER + // + +#if 1 + int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; + int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiiN.rgba[ii] A); \ + } + +#else + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_SURFACE_COLOR const rgba = \ + (SKC_RENDER_SURFACE_COLOR) \ + (color->aN.rgba[ii].r C, \ + color->aN.rgba[ii].g C, \ + color->aN.rgba[ii].b C, \ + 1.0); \ + SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \ + } + +#endif + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + x += 1; + } +#else + int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); + int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiiN.rgba[ii] A); \ + } + +#else + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_SURFACE_COLOR const rgba = \ + (SKC_RENDER_SURFACE_COLOR) \ + (color->aN.rgba[ii].r C, \ + color->aN.rgba[ii].g C, \ + color->aN.rgba[ii].b C, \ + 1.0); \ + SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \ + } + +#endif + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + } + +#endif +} + +#endif + +// +// +// +static +uint const +skc_ttck_lane(uint const ttck_idx) +{ + return ttck_idx & SKC_RENDER_SUBGROUP_MASK; +} + +// +// RENDER KERNEL +// + +__kernel +SKC_RENDER_KERNEL_ATTRIBS +void +skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers, + __global struct 
skc_group_node const * SKC_RESTRICT const groups, + __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename + + __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys + skc_uint const ttck_count, // rename: key_count + + __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets + skc_uint const tile_count, // rename: offset_count + + __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, +#ifdef SKC_SURFACE_IS_BUFFER + __global void * SKC_RESTRICT const surface, +#else + __write_only image2d_t surface, +#endif +#ifdef SKC_SURFACE_IS_BUFFER + skc_uint const surface_pitch, +#endif + uint4 const tile_clip) // rename: clip +{ + // + // Each subgroup is responsible for a tile. No extra subgroups are + // launched. + // + // FIXME -- might be better implemented as a "grid stride loop" if + // Intel GEN really has a local memory "quantum" of 4KB which means + // we would need to launch 4 subgroups per workgroup. + // + // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB. + // + + // + // declare tile cover and color registers + // + // this used to be a neat unified struct but the Intel GEN compiler + // wasn't cooperating and spilling to private memory even though all + // registers were indexed by constants + // + union skc_tile_color color_wip; + union skc_tile_color color_acc; + + union skc_tile_cover cover_wip; + union skc_tile_cover cover_acc; + union skc_tile_cover cover_msk; + + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0) + // as a uniform but the alternative calculation used when there are + // multiple subgroups per workgroup is not cooperating and + // driving spillage elsewhere. + // +#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const ttck_offset_idx = get_group_id(0); +#else + skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // load the starting ttck for this offset and get a bound on the max + // number of keys that might be loaded + // + // these are uniform across all subgroup lanes + // + skc_uint ttck_idx = ttck_offsets[ttck_offset_idx]; + + // + // FIXME -- SIMD/CPU version should probaby load a 256-bit (4-wide) + // vector of ttck keys + // +#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + + skc_ttck_t ttck = ttck_keys[ttck_idx]; + +#else + + uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK; + uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK; + skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)] + +#endif + + // + // set up style group/layer state + // + struct skc_styling_group { + union skc_group_range range; + skc_uint depth; + skc_uint id; + } group; + + group.range.lo = 0; + group.range.hi = SKC_UINT_MAX; + group.depth = 0; + group.id = SKC_UINT_MAX; + + // + // start with clear tile opacity, knockout and flag bits + // + // uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 + // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 + // + skc_uint flags = 0; + + // + // declare and initialize accumulators + // +#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) + __local union skc_subgroup_smem smem[1]; +#else + __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS]; + __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id(); +#endif + +#ifdef 
SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + // + // select the initial ttck key + // + skc_ttck_t ttck; +#if 0 + ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN +#else + ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND + ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane); +#endif + +#endif + + // + // save the first key so we know what tile we're in + // + skc_ttck_t ttck0 = ttck; + + // + // evaluate the coarse clip as late as possible + // + skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi); + + if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x)) + return; + + skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi); + + if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y)) + return; + +#if 0 + printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y); +#endif + + // + // load -> scatter -> flush + // + while (true) + { + // if scattering is disabled then just run through ttck keys + bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0; + + // need to clear accumulators before a scatter loop + if (is_scatter_enabled) + { + skc_tile_aa_zero(smem); + } + + do { + // skip scattering? + if (is_scatter_enabled) + { + skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo); + + if (skc_ttck_lo_is_prefix(ttck.lo)) { + skc_scatter_ttpb(ttxb_extent,smem,xb_id); + } else { + skc_scatter_ttsb(ttxb_extent,smem,xb_id); + } + } + + // + // any ttck keys left? + // + if (++ttck_idx >= ttck_count) + { + flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; + break; + } + + // + // process next ttck key + // +#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + // + // SIMD -- read next key + // + ttck = ttck_keys[ttck_idx]; +#else + // + // SIMT -- refresh the ttck_s? + // + uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK; + + if (ttck_lane_next == 0) + ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)]; + + // + // broadcast next key to entire subgroup + // +#if 0 + ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN +#else + ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND + ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next); +#endif +#endif + // continue scattering if on same YXL layer + } while (skc_ttck_equal_yxl(ttck0,ttck)); + + // finalize if no longer on same YX tile + if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi)) + { + // otherwise, unwind the tile styling and exit + flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; + } + + // + // given: new layer id from ttxk key + // + // load [layer id]{ group id, depth } + // + // if within current group's layer range + // + // if at same depth + // + // load and execute cover>[mask>]color>blend commands + // + // else if not at same depth then move deeper + // + // for all groups in group trail from cur depth to new depth + // enter group, saving and initializing regs as necessary + // increment depth and update layer range + // load and execute cover>[mask>]color>blend commands + // + // else not within layer range + // + // exit current group, restoring regs as necessary + // decrement depth and update layer range + // + // + skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi + union skc_layer_node const layer_node_new = layers[layer_id_new]; + + // clear flag that controls group/layer traversal + flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE; + + do { + bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0; + + // + // is 
layer a child of the current parent group? + // + uint cmd_next = 0; + + if (!unwind && (layer_node_new.parent == group.id)) + { + // execute this layer's cmds + cmd_next = layer_node_new.cmds; + + // if this is final then configure so groups get unwound, otherwise we're done + flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE); + } + else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi)) + { + // + // is layer in a child group? + // + union skc_group_parents const gp = groups[layer_node_new.parent].parents; + uint const gn = gp.depth - ++group.depth; + + if (gn == 0) + group.id = layer_node_new.parent; + else + group.id = commands[gp.base + gn - 1].parent; + + // update group layer range + group.range = groups[group.id].range; + + // enter current group + cmd_next = groups[group.id].cmds.enter; + } + else // otherwise, exit this group + { + // enter current group + cmd_next = groups[group.id].cmds.leave; + + // decrement group depth + if (--group.depth == 0) + { + flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE; + } + else + { + // get path_base of current group + uint const gnpb = groups[group.id].parents.base; + + // get parent of current group + group.id = commands[gnpb].parent; + + // update group layer range + group.range = groups[group.id].range; + } + } + + // + // execute cmds + // + while (true) + { + union skc_styling_cmd const cmd = commands[cmd_next++]; + + switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE) + { + case SKC_STYLING_OPCODE_NOOP: + break; + + case SKC_STYLING_OPCODE_COVER_NONZERO: + skc_tile_cover_nonzero(smem,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_EVENODD: + skc_tile_cover_evenodd(smem,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACCUMULATE: + skc_tile_cover_accumulate(&cover_acc,&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_MASK: + skc_tile_cover_wip_mask(&cover_wip,&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_WIP_ZERO: + skc_tile_cover_wip_zero(&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACC_ZERO: + skc_tile_cover_acc_zero(&cover_acc); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_ZERO: + skc_tile_cover_msk_zero(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_ONE: + skc_tile_cover_msk_one(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_INVERT: + skc_tile_cover_msk_invert(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COLOR_FILL_SOLID: + skc_tile_color_fill_solid(commands,&cmd_next,&color_wip); + break; + + case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR: + // + // FIXME -- gradients shouldn't be executing so much + // conditional driven code at runtime since we *know* + // the gradient style on the host can just create a + // new styling command to exploit this. + // + // FIXME -- it might be time to try using the GPU's + // sampler on a linear array of half4 vectors -- it + // might outperform the explicit load/lerp routines. + // + // FIXME -- optimizing for vertical gradients (uhhh, + // they're actually horizontal due to the -90 degree + // view transform) is nice but is it worthwhile to + // have this in the kernel? Easy to add it back... 
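+ // One concrete (and unverified) form of the sampler idea above:
+ // bind the stop colors as a 1D RGBA image and sample it with a
+ // CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE |
+ // CLK_FILTER_LINEAR sampler so that read_imagef() performs the
+ // lerp between adjacent stops.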
+ // +#if defined( SKC_ARCH_GEN9 ) + // disable gradients due to exessive spillage -- fix later + cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32); +#else + skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi); +#endif + break; + + case SKC_STYLING_OPCODE_COLOR_WIP_ZERO: + skc_tile_color_wip_zero(&color_wip); + break; + + case SKC_STYLING_OPCODE_COLOR_ACC_ZERO: + skc_tile_color_acc_zero(&color_acc); + break; + + case SKC_STYLING_OPCODE_BLEND_OVER: + skc_tile_blend_over(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_PLUS: + skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_MULTIPLY: + skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_KNOCKOUT: + skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK: + // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK: + // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc); + break; + + case SKC_STYLING_OPCODE_BACKGROUND_OVER: + skc_tile_background_over(commands,&cmd_next,&color_acc); + break; + + case SKC_STYLING_OPCODE_SURFACE_COMPOSITE: +#ifdef SKC_SURFACE_IS_BUFFER + skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi); +#else + skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi); +#endif + break; + + case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY: + if (skc_tile_color_test_opacity(&color_acc)) + flags |= SKC_TILE_FLAGS_SCATTER_SKIP; + break; + + default: + return; // this is an illegal opcode -- trap and die! + } + + // + // if sign bit is set then this was final command + // + if (cmd.s32 < 0) + break; + } + + // continue as long as tile flush isn't complete + } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0); + + // return if was the final flush + if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) + return; + + // update wip ttck_hi + ttck0 = ttck; + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl new file mode 100644 index 0000000000..378d51d8d7 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl @@ -0,0 +1,130 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE +// LAYOUT OF THE TTCK KEY. 
IF THE TTCK KEY IS ALTERED THEN THIS +// KERNEL WILL NEED TO BE UPDATED +// + +#include "tile.h" +#include "atomic_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) +#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) + +// +// +// + +#define SKC_YX_NEQ(row,prev) \ + (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0) + +// +// +// + +__kernel +__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) +void +skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, + __global uint * SKC_RESTRICT const indices, + __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) +{ + uint const global_id = get_global_id(0); + uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); + uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + + // + // LOAD ALL THE ROWS + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + + HS_SLAB_ROWS(); + + // + // LOAD LAST REGISTER FROM COLUMN TO LEFT + // + uint diffs = 0; + uint2 r0 = r1; + + if (gmem_base > 0) { + // if this is the first key in any slab but the first then it + // broadcast loads the last key in previous slab + r0.hi = as_uint2(vout[gmem_base - 1]).hi; + } else if (get_sub_group_local_id() == 0) { + // if this is the first lane in the first slab + diffs = 1; + } + + // now shuffle in the last key from the column to the left + r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); + + // + // FIND ALL DIFFERENCES IN SLAB + // + uint valid = 0; + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + valid |= ((r##row != SKC_ULONG_MAX) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + diffs |= (SKC_YX_NEQ(row,prev) << prev); + + HS_SLAB_ROWS(); + + // + // SUM UP THE DIFFERENCES + // + uint const valid_diffs = valid & diffs; + uint const count = popcount(valid_diffs); + uint const inclusive = sub_group_scan_inclusive_add(count); + uint const exclusive = inclusive - count; + + // + // RESERVE SPACE IN THE INDICES ARRAY + // + uint next = 0; + + if (get_sub_group_local_id() == HS_LANES_PER_WARP-1) + next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset + + // distribute base across subgroup + next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1); + + // + // STORE THE INDICES + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (valid_diffs & (1 << prev)) \ + indices[next++] = lane_idx + prev; + + HS_SLAB_ROWS(); + + // + // TRANSPOSE THE SLAB AND STORE IT + // + HS_TRANSPOSE_SLAB(); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl new file mode 100644 index 0000000000..e9accde307 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl @@ -0,0 +1,394 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE +// LAYOUT OF THE TTRK KEY. 
IF THE TTRK KEY IS ALTERED THEN THIS +// KERNEL WILL NEED TO BE UPDATED +// + +#include "tile.h" +#include "raster_builder_cl_12.h" // need meta_in structure +#include "device_cl_12.h" + +// +// +// + +#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) +#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) + +// +// THE BEST TYPE TO ZERO SMEM +// + +#define SKC_ZERO_TYPE ulong +#define SKC_ZERO_WORDS 2 + +// +// THE ORDER OF COMPONENTS IS: +// +// 0: blocks +// 1: offset +// 2: pk +// 3: rk +// + +#if (HS_KEYS_PER_SLAB < 256) + +#define SKC_META_TYPE uint +#define SKC_META_WORDS 1 + +#define SKC_COMPONENT_TYPE uchar + +#else + +#define SKC_META_TYPE uint2 +#define SKC_META_WORDS 2 + +#define SKC_COMPONENT_TYPE ushort + +#endif + +// +// +// + +#if ( SKC_TTRK_HI_BITS_COHORT <= 8) +#define SKC_COHORT_TYPE uchar +#else +#define SKC_COHORT_TYPE ushort +#endif + +// +// +// + +#define SKC_COHORT_ID(row) \ + as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT + +// +// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED +// + +#define SKC_IS_BLOCK(row) \ + ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + +#define SKC_YX(row,prev) \ + (as_uint2(r##row).hi ^ as_uint2(r##prev).hi) + +#define SKC_IS_PK(row,prev) \ + ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X) + +// +// COHORT SIZE IS ALWAYS A POWER-OF-TWO +// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO +// +// COHORT SIZE >= SUBGROUP SIZE +// + +#define SKC_COHORT_SIZE (1<> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); + uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + + // + // LOAD ALL THE ROWS + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + + HS_SLAB_ROWS(); + + // + // LOAD LAST REGISTER FROM COLUMN TO LEFT + // + uint diffs = 0; + uint2 r0 = 0; + + if (gmem_base > 0) { + // if this is the first key in any slab but the first then it + // broadcast loads the last key in previous slab + r0.hi = as_uint2(vout[gmem_base - 1]).hi; + } else { + // otherwise broadcast the first key in the first slab + r0.hi = sub_group_broadcast(as_uint2(r1).hi,0); + // and mark it as an implicit diff + if (get_sub_group_local_id() == 0) + diffs = 1; + } + + // now shuffle in the last key from the column to the left + r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); + + // shift away y/x + SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT; + + // + // EXTRACT ALL COHORT IDS EARLY... + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row); + + HS_SLAB_ROWS(); + + // + // DEBUG + // +#if 0 + if (gmem_base == HS_KEYS_PER_SLAB * 7) + { + if (get_sub_group_local_id() == 0) + printf("\n%llX ",as_ulong(r0)); + else + printf("%llX ",as_ulong(r0)); +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (get_sub_group_local_id() == 0) \ + printf("\n%llX ",r##row); \ + else \ + printf("%llX ",r##row); + + HS_SLAB_ROWS(); + } +#endif + + // + // CAPTURE ALL CONDITIONS WE CARE ABOUT + // + // Diffs must be captured before cohorts + // + uint valid = 0; + uint blocks = 0; + uint pks = 0; + SKC_COHORT_TYPE c_max = 0; + + // + // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. 
AN + // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE + // +#if 0 + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + diffs |= ((c##row != c##prev) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + blocks |= (SKC_IS_BLOCK(row) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + pks |= SKC_IS_PK(row,prev) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + valid |= ((r##row != SKC_ULONG_MAX) << prev); + + HS_SLAB_ROWS(); + +#else + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (c##row != c##prev) \ + diffs |= 1<>HS_LANES_PER_WARP_LOG2,c_min,c_max); +#endif + + // + // ZERO SMEM + // + // zero only the meta info for the cohort ids found in this slab + // +#if (SKC_ZERO_WORDS >= SKC_META_WORDS) + uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id(); + uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO; + + for (; zz<=zz_max; zz+=HS_LANES_PER_WARP) + shared.z[zz] = 0; +#else + // ERROR -- it's highly unlikely that the zero type is smaller than + // the meta type +#error("Unsupported right now...") +#endif + + // + // ACCUMULATE AND STORE META INFO + // + uint const valid_blocks = valid & blocks; + uint const valid_pks = valid & pks & ~diffs; + SKC_META_TYPE meta = ( 0 ); + +#define SKC_META_LOCAL_ADD(meta) \ + atomic_add(shared.m+HS_REG_LAST(c),meta); + +#define SKC_META_LOCAL_STORE(meta,prev) \ + shared.m[c##prev] = meta; + + // note this is purposefully off by +1 +#define SKC_META_RESET(meta,curr) \ + meta = ((gmem_off + curr) << 8); + +#if 0 + + // FIXME -- this can be tweaked to shift directly +#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ + meta += ((((blocks >> prev) & 1) ) | \ + (((pks >> prev) & 1) << 16) | \ + (((rks >> prev) & 1) << 24)); + +#else + +#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ + if (blocks & (1<= cc_min) && (cc <= cc_max)) + { + uint const c = shared.c[cc]; + + if (c != 0) + atomic_add(metas+cc,c+adjust); + } + + cc += HS_LANES_PER_WARP; + + for (; cc<=cc_max; cc+=HS_LANES_PER_WARP) + { + uint const c = shared.c[cc]; + + if (c != 0) + atomic_add(metas+cc,c+adjust); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/path_builder_cl_12.c b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c new file mode 100644 index 0000000000..e915dffada --- /dev/null +++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c @@ -0,0 +1,1443 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include +#include +#include +#include + +#include "common/cl/assert_cl.h" + +#include "context.h" +#include "handle.h" +#include "grid.h" +#include "path.h" +#include "path_builder.h" + +#include "config_cl.h" +#include "export_cl_12.h" +#include "runtime_cl_12.h" +#include "path_builder_cl_12.h" + +// +// OpenCL 1.2 devices support mapping of buffers into the host address +// space. +// +// Mapped buffers must be aligned on MIN_DATA_TYPE_ALIGN_SIZE bit +// boundary (e.g. 128 bytes). This complicates coordinating sharing +// of data between the host and the device. +// +// Some OpenCL 2.0 devices support fine-grained shared virtual memory +// pointers with byte-addressing and allow simpler coordination +// strategies at the cost of maintaining cache coherency. 
+// +// The path builder is focused on moving bulk path data from the host +// into the device-managed "block" memory pool and arranging it into a +// SIMT/SIMD-friendly data structure that can be efficiently read by +// the rasterizer. +// +// Note that one simplifying assumption is that the maximum length of +// a *single* path can't be larger than what fits in the single extent +// (which is split into M subbuffers). This would be a very long path +// and a legitimate size limitation. +// +// For some systems, it may be appropriate to never pull path data +// into the device-managed block pool and instead present the path +// data to the device in a temporarily available allocated memory +// "zone" of paths that can be discarded all at once. +// +// For other systems, it may be appropriate to simply copy the path +// data from host to device. +// +// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be +// targeting support basic map/unmap functionality similar to OpenCL +// 1.2. Furthermore, not all OpenCL 2.0 devices support fine-grained +// sharing of memory and still require a map/unmap step... but note +// that they all support byte-aligned mapping and subbuffers. +// +// The general strategy that this particular CL_12 implementation uses +// is to allocate a large mappable bulk-data path buffer and an +// auxiliary mappable command buffer. +// +// The buffers are split into a reasonable number of properly aligned +// subbuffers to enable simultaneous host and device access. +// + +// +// Blocks: +// 1 extent +// M mapped subbuffers (configurable) to allow for concurrency +// +// Commands: +// 1 extent +// M mapped subbuffers (configurable) to allow for concurrency +// +// Spans: +// M hi/lo structures +// +// { cl_sub, void*, event, base } +// +// - size of sub buffer +// - remaining +// +// - counts +// + +// +// For any kernel launch, at most one path will be discontiguous and +// defined across two sub-buffers. +// +// Nodes are updated locally until full and then stored so they will +// never be incomplete. Headers are stored locally until the path is +// ended so they will never be incomplete. +// +// A line, quad or cubic acquires 4/6/8 segments which may be spread +// across one or more contiguous blocks. +// +// If a flush() occurs then the remaining columns of multi-segment +// paths are initialized with zero-length line, quad, cubic elements. +// +// Every block's command word has a type and a count acquired from a +// rolling counter. +// +// The kernel is passed two spans of blocks { base, count } to +// process. The grid must process (lo.count + hi.count) blocks.
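+//
+// Illustrative numbers only (not pulled from the config): with a
+// 16-block ring, a span wrapping from block 14 up to (but not
+// including) block 3 would be passed as lo = { 14, 2 } and
+// hi = { 0, 3 }, so the kernel processes 2 + 3 = 5 blocks.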
+// + +struct skc_subbuffer_blocks +{ + cl_mem device; + void * host; +}; + +struct skc_subbuffer_cmds +{ + cl_mem device; + void * host; + cl_event map; +}; + +// +// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer ) +// + +typedef skc_uint skc_ringdex_t; + +union skc_ringdex_expand +{ + div_t qr; + + struct { +#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0 + skc_uint subbuf; + skc_uint block; +#else + skc_uint block; + skc_uint subbuf; +#endif + }; +}; + +// +// this record is executed by the grid +// + +struct skc_release_record +{ + struct skc_path_builder_impl * impl; // back pointer to impl + + skc_grid_t grid; // pointer to scheduled grid + + skc_uint from; // inclusive starting index : [from,to) + skc_uint to; // non-inclusive ending index : [from,to) +}; + +// +// +// + +struct skc_path_builder_impl +{ + struct skc_path_builder * path_builder; + + struct skc_runtime * runtime; + + cl_command_queue cq; + + struct { + cl_kernel alloc; + cl_kernel copy; + } kernels; + + // + // FIXME -- make this pointer to constant config + // + // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv + struct { + skc_uint subbufs; // how many subbufs in the buffer? + + struct { + skc_uint buffer; // how many blocks in the buffer? + skc_uint subbuf; // how many blocks in a subbuf? + } blocks_per; + } ring; + // + // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^ + // + + struct { + cl_mem buffer; // backing buffer for blocks + struct skc_subbuffer_blocks * subbufs; // array of structures + } blocks; + + struct { + cl_mem buffer; // backing buffer for commands + struct skc_subbuffer_cmds * subbufs; // array of structures + } cmds; + + struct { + struct skc_release_record * records; // max release records is equal to max subbufs + skc_path_t * paths; // max paths is less than or equal to max commands + } release; + + cl_mem reads; // each kernel only requires one word to store the block pool "base" + + struct { + skc_uint rolling; // rolling counter used by cmds to map to block pool alloc + skc_ringdex_t from; + skc_ringdex_t to; + } prev; + + struct { + skc_ringdex_t from; + skc_ringdex_t to; + } curr; + + struct { + struct skc_path_head * head; // pointer to local path header -- not written until path end + struct skc_path_node * node; // pointer to local node -- may alias head until head is full + + struct { + skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated + union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids + skc_uint rem; // how many id slots left in node block + } ids; + + struct { + skc_uint rem; // how many subblocks left in block? 
+ skc_uint rolling; // rolling counter of block of subblocks + float * next; // next subblock in current subblock block + skc_uint idx; // index of next subblock + } subblocks; + + struct { + skc_uint one; // .block = 1 + skc_uint next; // rolling counter used by cmds to map to block pool alloc + } rolling; + + skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current + } wip; +}; + +// +// FIXME -- move to a pow2 subbuffer size and dispense with division +// and modulo operations +// + +static +union skc_ringdex_expand +skc_ringdex_expand(struct skc_path_builder_impl * const impl, + skc_ringdex_t const ringdex) +{ + return (union skc_ringdex_expand){ + .qr = div(ringdex,impl->ring.blocks_per.subbuf) + }; +} + +static +void +skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl) +{ + // + // FIXME - which is faster? + // +#if 1 + impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer; +#else + impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer) ? -1 : impl->wip.to; +#endif + + // this path is too long -- for now assert() and die + assert(impl->wip.to != impl->curr.from); +} + +static +skc_ringdex_t +skc_ringdex_span(struct skc_path_builder_impl * const impl, + skc_ringdex_t const from, + skc_ringdex_t const to) +{ + return (to - from) % impl->ring.blocks_per.buffer; +} + +static +void +skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl) +{ + union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); + + // nothing to do if this is the first block in the subbuf + if (to.block == 0) + return; + + skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs; + + // otherwise increment and mod + impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf; +} + +static +skc_bool +skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl) +{ + return impl->curr.from == impl->curr.to; +} + +static +skc_bool +skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl) +{ + return impl->prev.from == impl->prev.to; +} + +static +skc_uint +skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl, + skc_uint const to_block) +{ + // no blocks acquired OR this is last block in subbuf + return !((impl->wip.to == impl->curr.to) || (to_block == 0)); +} + +// +// +// + +static +struct skc_release_record * +skc_release_curr(struct skc_path_builder_impl * const impl) +{ + union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); + + return impl->release.records + curr_from.subbuf; +} + +// +// FIXME -- get rid of all distant config references -- grab them at all at creation time +// + +static +void +skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl) +{ + // init header counters // { handle, blocks, nodes, prims } + impl->wip.head->header = (union skc_path_header){ + .handle = 0, + .blocks = 0, + .nodes = 0, + .prims = 0 + }; + + // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS + impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN }; + + // point wip ids at local head node + impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node + impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere + + // start with no subblocks + impl->wip.subblocks.rem = 0; +} + +// +// +// + +static +void +skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl) +{ +#if 1 + // + // FIXME -- 
a Duff's device might be optimal here but would have to + // be customized per device since node's could be 16-128+ words + // + while (impl->wip.ids.rem > 0) + { + impl->wip.ids.rem -= 1; + impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID; + impl->wip.ids.next += 1; + } +#else + memset(&impl->wip.ids.next->u32, + SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF + sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem); + + impl->wip.ids.next += impl->wip.ids.rem; + impl->wip.ids.rem = 0; +#endif +} + +// +// +// + +static +void +skc_zero_float(skc_float * p, skc_uint rem) +{ + memset(p,0,sizeof(*p)*rem); +} + +static +void +skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder) +{ + // + // FIXME -- it might be more performant to zero the remaining + // columns in a subblock -- a subblock at a time -- instead of the + // same column across all the subblocks + // +#if 0 + while (path_builder->line.rem > 0) + { + --path_builder->line.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + } + + while (path_builder->quad.rem > 0) + { + --path_builder->quad.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + *path_builder->line.coords[4]++ = 0.0f; + *path_builder->line.coords[5]++ = 0.0f; + } + + while (path_builder->cubic.rem > 0) + { + --path_builder->cubic.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + *path_builder->line.coords[4]++ = 0.0f; + *path_builder->line.coords[5]++ = 0.0f; + *path_builder->line.coords[6]++ = 0.0f; + *path_builder->line.coords[7]++ = 0.0f; + } +#else + if (path_builder->line.rem > 0) + { + skc_zero_float(path_builder->line.coords[0],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[1],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[2],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[3],path_builder->line.rem); + + path_builder->line.rem = 0; + } + + if (path_builder->quad.rem > 0) + { + skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem); + + path_builder->quad.rem = 0; + } + + if (path_builder->cubic.rem > 0) + { + skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem); + + path_builder->cubic.rem = 0; + } +#endif +} + +// +// +// + +static +void +skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl, + skc_uint from, + 
skc_uint to) +{ + // to might be out of range + to = to % impl->ring.subbufs; + +#if 0 + fprintf(stderr,"unmap: [%2u,%2u)\n",from,to); +#endif + + while (from != to) // 'to' might be out of range + { + // bring 'from' back in range + from = from % impl->ring.subbufs; + + struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; + struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; + + cl(EnqueueUnmapMemObject(impl->cq, + blocks->device, + blocks->host, + 0,NULL,NULL)); + + cl(EnqueueUnmapMemObject(impl->cq, + cmds->device, + cmds->host, + 0,NULL,NULL)); + + // bring from back in range + from = ++from % impl->ring.subbufs; + } +} + +// +// FIXME -- reuse this in create() +// + +static +void +skc_path_builder_impl_map(struct skc_path_builder_impl * const impl, + skc_uint from, + skc_uint to) +{ + // to might be out of range + to = to % impl->ring.subbufs; + +#if 0 + fprintf(stderr," map: [%2u,%2u)\n",from,to); +#endif + + while (from != to) + { + cl_int cl_err; + + struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; + struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; + + blocks->host = clEnqueueMapBuffer(impl->cq, + blocks->device, + CL_FALSE, + CL_MAP_WRITE_INVALIDATE_REGION, + 0,impl->runtime->config->paths_copy.block.subbuf, + 0,NULL,NULL, + &cl_err); cl_ok(cl_err); + + cl(ReleaseEvent(cmds->map)); + + cmds->host = clEnqueueMapBuffer(impl->cq, + cmds->device, + CL_FALSE, + CL_MAP_WRITE_INVALIDATE_REGION, + 0,impl->runtime->config->paths_copy.command.subbuf, + 0,NULL,&cmds->map, + &cl_err); cl_ok(cl_err); + + // bring from back in range + from = ++from % impl->ring.subbufs; + } + // + // FIXME -- when we switch to out of order queues we'll need a barrier here + // +} + +// +// +// + +static +void +skc_path_builder_release_dispose(struct skc_release_record * const release, + struct skc_path_builder_impl * const impl) +{ + struct skc_runtime * runtime = impl->runtime; + + if (release->from <= release->to) // no wrap + { + skc_path_t const * paths = impl->release.paths + release->from; + skc_uint count = release->to - release->from; + + skc_grid_deps_unmap(runtime->deps,paths,count); + skc_runtime_path_device_release(runtime,paths,count); + } + else // from > to implies wrap + { + skc_path_t const * paths_lo = impl->release.paths + release->from; + skc_uint count_lo = impl->ring.blocks_per.buffer - release->from; + + skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo); + skc_runtime_path_device_release(runtime,paths_lo,count_lo); + + skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to); + skc_runtime_path_device_release(runtime,impl->release.paths,release->to); + } + + release->to = release->from; +} + +static +void +skc_path_builder_grid_pfn_dispose(skc_grid_t const grid) +{ + struct skc_release_record * const release = skc_grid_get_data(grid); + struct skc_path_builder_impl * const impl = release->impl; + + skc_path_builder_release_dispose(release,impl); +} + +static +void +// skc_path_builder_complete(struct skc_release_record * const release) +skc_path_builder_complete(skc_grid_t grid) +{ + // + // notify deps that this grid is complete enough for other grids to + // proceed + // + // the path builder still has some cleanup to do before all its + // resources can be reused + // + skc_grid_complete(grid); +} + +static +void +skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid) +{ + SKC_CL_CB(status); + + struct skc_release_record * const release = skc_grid_get_data(grid); + + 
SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid); +} + +// +// +// + +static +void +skc_path_builder_grid_pfn_waiting(skc_grid_t const grid) +{ + struct skc_release_record * const release = skc_grid_get_data(grid); + struct skc_path_builder_impl * const impl = release->impl; + + // 1. flush incomplete subblocks of path elements + // 2. unmap subbuffer on cq.unmap + // 3. flush cq.unmap + // 4. launch kernel on cq.kernel but wait for unmap completion + // 5. flush cq.kernel + // 6. remap relevant subbuffers on cq.map but wait for kernel completion + // 7. flush cq.map + + // + // FIXME -- can be smarter about flushing if the wip paths are not + // in the same subbuf as curr.to + // + // THIS IS IMPORTANT TO FIX + // + + // flush incomplete subblocks + skc_path_builder_finalize_subblocks(impl->path_builder); + + // + // get range of subbufs that need to be unmapped + // + // note that impl->prev subbufs have already been unmapped + // + union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); + union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to); + skc_uint const is_partial = curr_to.block > 0; + skc_uint const unmap_to = curr_to.subbuf + is_partial; + + // + // unmap all subbufs in range [from,to) + // + skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to); + + // + // launch kernels + // + skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to); + skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to); + skc_uint const pb_cmds = pb_prev_span + pb_curr_span; + + // + // 1) allocate blocks from pool + // + + // + // FIXME -- pack integers into struct/vector + // + cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads))); + cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf))); + cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds))); + + skc_device_enqueue_kernel(impl->runtime->device, + SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, + impl->cq, + impl->kernels.alloc, + 1, + 0,NULL,NULL); + + // + // 2) copy blocks from unmapped device-accessible memory + // + + // + // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7 + // + cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw))); + + cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask))); + + cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads))); + cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf))); + + cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer))); + cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer))); + + cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer))); + cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling))); + + cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from))); + cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span))); + cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from))); + + cl_event complete; + + skc_device_enqueue_kernel(impl->runtime->device, + SKC_DEVICE_KERNEL_ID_PATHS_COPY, + impl->cq, + impl->kernels.copy, + pb_cmds, + 0,NULL,&complete); + + // 
set a callback on completion + cl(SetEventCallback(complete,CL_COMPLETE, + skc_path_builder_paths_copy_cb, + grid)); + + // immediately release + cl(ReleaseEvent(complete)); + + // + // remap as many subbuffers as possible after the kernel completes + // + // note that remaps are async and enqueued on the same command queue + // as the kernel launch + // + // we can't remap subbuffers that are in the possibly empty range + // + // cases: + // + // - curr.to == wip.to which means no blocks have been acquired + // - curr.to points to first block in (next) subbuf + // - otherwise, wip acquired blocks in the curr.to subbuf + // + // check for these first 2 cases! + // + union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from); + skc_uint const no_wip = impl->curr.to == impl->wip.to; + skc_uint map_to = curr_to.subbuf + (is_partial && no_wip); + + // remap all subbufs in range [from,to) + skc_path_builder_impl_map(impl,prev_from.subbuf,map_to); + + // flush command queue + cl(Flush(impl->cq)); + + // save rolling + impl->prev.rolling = impl->wip.rolling.next; + + // update prev and curr + if (no_wip) + { + // + // if there was no wip then round up to the next subbuf + // + skc_ringdex_wip_to_subbuf_inc(impl); + + // + // update prev/curr with with incremented wip + // + impl->prev.from = impl->prev.to = impl->wip.to; + impl->curr.from = impl->curr.to = impl->wip.to; + } + else + { + // + // update prev with wip partials + // + impl->prev.from = impl->curr.to; + impl->prev.to = impl->wip .to; + + // + // start curr on a new subbuf boundary + // + skc_ringdex_wip_to_subbuf_inc(impl); + + impl->curr.from = impl->wip.to; + impl->curr.to = impl->wip.to; + } +} + +// +// +// + +static +void +skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl, + skc_uint const subbuf) +{ + // + // FIXME -- move to a power-of-two subbuf size and kickstart path + // copies as early as possible + // + // FIXME -- the subbufs "self-clock" (flow control) the kernel + // launches and accounting. Combine all the subbuffers and release + // records into a single indexable struct instead of 3. + // + struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf; + struct skc_release_record * const release = impl->release.records + subbuf; + struct skc_scheduler * const scheduler = impl->runtime->scheduler; + + // can't proceed until the paths have been released + SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to); + + // throw in a scheduler yield ... FIXME -- get rid of + skc_scheduler_yield(scheduler); + + // can't proceed until the subbuffer is mapped + cl(WaitForEvents(1,&sc->map)); +} + +// +// +// + +static +union skc_ringdex_expand +skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl) +{ + // break ringdex into components + union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); + + // does wip ringdex point to a new subbuffer? 
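// (to.block == 0 means wip.to just rolled over into the first block
// of a subbuffer, so that subbuffer may still be owned by an
// in-flight copy and has to be acquired -- drained and remapped --
// before the host writes into it)
//
// [editorial sketch -- not part of this change] if blocks_per.subbuf
// were a power of two (see the FIXME above skc_ringdex_expand) the
// div/mod in expand() and this boundary test would reduce to a shift
// and a mask; "subbuf_log2" below is a hypothetical precomputed
// constant:
#if 0
  skc_uint const subbuf = impl->wip.to >> impl->ring.blocks_per.subbuf_log2;  // wip.to / blocks_per.subbuf
  skc_uint const block  = impl->wip.to &  (impl->ring.blocks_per.subbuf - 1); // wip.to % blocks_per.subbuf
#endif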
+ if (to.block == 0) + { + // potentially spin/block waiting for subbuffer + skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf); + } + + // post increment wip.to + skc_ringdex_wip_to_block_inc(impl); + + return to; +} + +// +// +// + +static +skc_uint +skc_rolling_block(skc_uint const rolling, skc_uint const tag) +{ + return rolling | tag; +} + +static +skc_uint +skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag) +{ + return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag; +} + +static +void +skc_rolling_inc(struct skc_path_builder_impl * const impl) +{ + impl->wip.rolling.next += impl->wip.rolling.one; +} + +// +// +// + +static +void * +skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl, + skc_uint const rolling, + skc_cmd_paths_copy_tag const tag) +{ + // bump blocks count + impl->wip.head->header.blocks += 1; + + // acquire a block + union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl); + + // make a pointer + union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host; + + // store command for block + cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag); + +#if 0 + // store command for block + cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag); + + // increment rolling + skc_rolling_inc(impl); +#endif + + // return pointer to block + float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host; + + // FIXME -- make it easier to get config constant + return blocks_subbuf + (to.block * impl->runtime->config->block.words); +} + +// +// +// + +static +void +skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl) +{ + // store command to subbuf and get pointer to blocks subbuf + void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling, + SKC_CMD_PATHS_COPY_TAG_NODE); + + // copy head to blocks subbuf -- write-only + memcpy(block,impl->wip.node,impl->runtime->config->block.bytes); +} + +static +void +skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl) +{ + // store command to subbuf and get pointer to blocks subbuf + void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, + SKC_CMD_PATHS_COPY_TAG_HEAD); + + // copy head to blocks subbuf -- write-only + memcpy(block,impl->wip.head,impl->runtime->config->block.bytes); + + // increment rolling + skc_rolling_inc(impl); + + // the 'to' index is non-inclusive so assign wip.to after flush_head + impl->curr.to = impl->wip.to; +} + +// +// +// + +static +void +skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl) +{ + // update final block id in node + impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT); + + // if wip.ids is not the header then flush now full wip node + if (impl->wip.head->header.nodes > 0) + skc_path_builder_impl_flush_node(impl); + + // bump node count + impl->wip.head->header.nodes += 1; + + // save current rolling + impl->wip.ids.rolling = impl->wip.rolling.next; + + // increment rolling + skc_rolling_inc(impl); + + // update wip.ids.* + impl->wip.ids.next = impl->wip.node->tag_ids; + impl->wip.ids.rem = impl->runtime->config->block.words; +} + +static +void +skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl) +{ + impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure + impl->wip.subblocks.rolling = 
impl->wip.rolling.next; + impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, + SKC_CMD_PATHS_COPY_TAG_SEGS); + impl->wip.subblocks.idx = 0; + + // increment rolling + skc_rolling_inc(impl); +} + +// +// +// + +static +void +skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl, + skc_block_id_tag tag, + skc_uint vertices, + float * * subblocks) +{ + // + // FIRST TAG RECORDS THE ELEMENT TYPE + // + while (true) + { + // if only one block id left in node then acquire new node block + // and append its block id as with a next tag + if (impl->wip.ids.rem == 1) + skc_path_builder_impl_new_node_block(impl); + + // if zero subblocks left then acquire a new subblock block and + // append its block id + if (impl->wip.subblocks.rem == 0) + skc_path_builder_impl_new_segs_block(impl); + + // save first command -- tag and subblocks may have been updated + impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag); + + // increment node block subblock pointer + impl->wip.ids.next += 1; + impl->wip.ids.rem -= 1; + + // how many vertices can we store + skc_uint rem = min(vertices,impl->wip.subblocks.rem); + + // decrement vertices + vertices -= rem; + impl->wip.subblocks.rem -= rem; + impl->wip.subblocks.idx += rem; + + // assign subblocks + do { + *subblocks++ = impl->wip.subblocks.next; + impl->wip.subblocks.next += impl->runtime->config->subblock.words; + // FIXME -- move constants closer to structure + } while (--rem > 0); + + // anything left to do? + if (vertices == 0) + break; + + // any tag after this will be a caboose command + tag = SKC_BLOCK_ID_TAG_PATH_NEXT; + } +} + +// +// +// + +static +void +skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path) +{ + // finalize incomplete active subblocks -- we don't care about any + // remaining unused subblocks in block + skc_path_builder_finalize_subblocks(impl->path_builder); + + // mark remaining wips.ids in the head or node as invalid + skc_path_builder_impl_finalize_node(impl); + + // flush node if rem > 0 and node is not actually head + if (impl->wip.head->header.nodes >= 1) + skc_path_builder_impl_flush_node(impl); + + // acquire path host id + *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN + + // save path host handle + impl->wip.head->header.handle = *path; + + // flush head -- acquires a block and bumps head->header.blocks + skc_path_builder_impl_flush_head(impl); + + // get current release + struct skc_release_record * const release = skc_release_curr(impl); + + // acquire grid if null + if (release->grid == NULL) + { + release->grid = + SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + &release->grid, // NULL on start/force + release, // data payload + skc_path_builder_grid_pfn_waiting, + NULL, // no execute pfn + skc_path_builder_grid_pfn_dispose); + } + + // update grid map + skc_grid_map(release->grid,*path); + + // update path release + impl->release.paths[release->to] = *path; + + // increment release.to + release->to = (release->to + 1) % impl->ring.blocks_per.buffer; + + // add guard bit + *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH; + +#if 1 + // + // eager kernel launch? 
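// (i.e. once curr.to has crossed into a different subbuffer than
// curr.from, the subbuffer holding the front of the span can no
// longer grow, so starting the grid here gets its copy kernels
// going without waiting for an explicit flush from the caller)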
+ // + { + union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from); + union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to); + + if (curr_from.subbuf != curr_to.subbuf) + { + skc_grid_start(release->grid); + // skc_scheduler_yield(impl->runtime->scheduler); + } + } +#endif +} + +// +// FIXME -- clean up accessing of CONFIG constants in these 3 routines +// + +static +void +skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4, + impl->path_builder->line.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->line.rem = impl->runtime->config->subblock.words; +} + +static +void +skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6, + impl->path_builder->quad.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->quad.rem = impl->runtime->config->subblock.words; +} + +static +void +skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8, + impl->path_builder->cubic.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->cubic.rem = impl->runtime->config->subblock.words; +} + +// +// +// + +static +void +skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl) +{ + // decrement reference count + if (--impl->path_builder->refcount != 0) + return; + + // + // otherwise, dispose of everything + // + struct skc_runtime * const runtime = impl->runtime; + + // free path builder + skc_runtime_host_perm_free(impl->runtime,impl->path_builder); + + // release cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // release kernels + cl(ReleaseKernel(impl->kernels.alloc)); + cl(ReleaseKernel(impl->kernels.copy)); + + // free blocks extents + cl(ReleaseMemObject(impl->blocks.buffer)); + skc_runtime_host_perm_free(runtime,impl->blocks.subbufs); + + cl(ReleaseMemObject(impl->cmds.buffer)); + skc_runtime_host_perm_free(runtime,impl->cmds.subbufs); + + // free records + skc_runtime_host_perm_free(runtime,impl->release.records); + skc_runtime_host_perm_free(runtime,impl->release.paths); + + // release staging head and node + skc_runtime_host_perm_free(runtime,impl->wip.head); + skc_runtime_host_perm_free(runtime,impl->wip.node); + + // release reads scratch array + cl(ReleaseMemObject(impl->reads)); + + // for all subbuffers + // unmap subbuffer + // release subbuffer + // printf("%s not releasing subbuffers\n",__func__); + + skc_runtime_host_perm_free(impl->runtime,impl); +} + +// +// +// + +skc_err +skc_path_builder_cl_12_create(struct skc_context * const context, + struct skc_path_builder * * const path_builder) +{ + // + // retain the context + // skc_context_retain(context); + // + struct skc_runtime * const runtime = context->runtime; + + // allocate path builder + (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder)); + + // init state + SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY); + + (*path_builder)->context = context; + + // save opaque impl-specific 
pointers
+ (*path_builder)->begin = skc_path_builder_pfn_begin;
+ (*path_builder)->end = skc_path_builder_pfn_end;
+ (*path_builder)->new_line = skc_path_builder_pfn_new_line;
+ (*path_builder)->new_quad = skc_path_builder_pfn_new_quad;
+ (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic;
+ (*path_builder)->release = skc_path_builder_pfn_release;
+
+ // initialize path builder counts
+ (*path_builder)->line.rem = 0;
+ (*path_builder)->quad.rem = 0;
+ (*path_builder)->cubic.rem = 0;
+
+ (*path_builder)->refcount = 1;
+
+ struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));
+
+ (*path_builder)->impl = impl;
+
+ //
+ // init impl
+ //
+ impl->path_builder = *path_builder;
+ impl->runtime = runtime;
+
+ impl->cq = skc_runtime_acquire_cq_in_order(runtime);
+
+ impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
+ impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);
+
+ //
+ // FIXME -- let these config constants remain constant and in place
+ //
+ struct skc_config const * const config = runtime->config;
+
+ impl->ring.subbufs = config->paths_copy.buffer.count;
+ impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
+ impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
+ //
+ // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ //
+
+ cl_int cl_err;
+
+ // allocate large device-side extent for path data
+ impl->blocks.buffer = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // allocate small host-side array of pointers to mapped subbufs
+ impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->blocks.subbufs));
+
+ // allocate large device-side extent for path copy commands
+ impl->cmds.buffer = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ config->paths_copy.command.buffer,
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // allocate small host-side array of pointers to mapped subbufs
+ impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->cmds.subbufs));
+
+ // allocate small host-side array of intervals of path handles
+ impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->release.records));
+
+ // allocate large host-side array that is max # of path handles in flight
+ impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.blocks_per.buffer *
+ sizeof(*impl->release.paths));
+
+ // small scratch used by kernels
+ impl->reads = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
+ sizeof(skc_uint) * impl->ring.subbufs,
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // initialize release record with impl backpointer
+ for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
+ {
+ struct skc_release_record * record = impl->release.records + ii;
+
+ record->impl = impl;
+ record->grid = NULL;
+ record->from = record->to = ii * impl->ring.blocks_per.subbuf;
+ }
+
+ //
+ // allocate and map subbuffers -- we always check the command
+ // subbuffer's map/unmap events before touching it or its associated
+ // block subbuffer.
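// (skc_path_builder_impl_acquire_subbuffer() above is where that
// check happens: it waits for the subbuffer's release record to
// drain and then for the command subbuffer's map event)
//
// [editorial sketch -- not part of this change] the same routine
// carries a FIXME asking for the three parallel per-subbuffer
// arrays allocated above to be folded into a single indexable
// record -- roughly along these lines (hypothetical layout):
#if 0
struct skc_subbuffer
{
  struct skc_subbuffer_blocks blocks;  // device sub-buffer + mapped host pointer for path data
  struct skc_subbuffer_cmds   cmds;    // device sub-buffer + mapped host pointer + map event for copy cmds
  struct skc_release_record   release; // grid + [from,to) interval of path handles to release
};
#endif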
+ //
+ struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
+ struct skc_subbuffer_cmds * sc = impl->cmds .subbufs;
+
+ cl_buffer_region rb = { 0, config->paths_copy.block.subbuf };
+ cl_buffer_region rc = { 0, config->paths_copy.command.subbuf };
+
+ // for each subbuffer
+ for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
+ {
+ sb->device = clCreateSubBuffer(impl->blocks.buffer,
+ CL_MEM_HOST_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &rb,
+ &cl_err); cl_ok(cl_err);
+
+ sb->host = clEnqueueMapBuffer(impl->cq,
+ sb->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,rb.size,
+ 0,NULL,NULL,
+ &cl_err); cl_ok(cl_err);
+
+ sc->device = clCreateSubBuffer(impl->cmds.buffer,
+ CL_MEM_HOST_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &rc,
+ &cl_err); cl_ok(cl_err);
+
+ sc->host = clEnqueueMapBuffer(impl->cq,
+ sc->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,rc.size,
+ 0,NULL,&sc->map,
+ &cl_err); cl_ok(cl_err);
+ sb += 1;
+ sc += 1;
+
+ rb.origin += rb.size;
+ rc.origin += rc.size;
+ }
+
+ //
+ // initialize remaining members
+ //
+ impl->prev.from = 0;
+ impl->prev.to = 0;
+ impl->prev.rolling = 0;
+
+ impl->curr.from = 0;
+ impl->curr.to = 0;
+
+ impl->wip.to = 0;
+
+ impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
+ impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
+
+ impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
+ impl->wip.rolling.next = 0;
+
+ // for now, completely initialize builder before returning
+ cl(Finish(impl->cq));
+
+ return SKC_ERR_SUCCESS;
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/path_builder_cl_12.h b/src/compute/skc/platforms/cl_12/path_builder_cl_12.h
new file mode 100644
index 0000000000..20bb13cbdf
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#ifndef PATH_BUILDER_CL_12_ONCE
+#define PATH_BUILDER_CL_12_ONCE
+
+//
+//
+//
+
+#include "block.h"
+
+//
+// A tag type that fits into the block id tag bitfield
+//
+
+typedef enum skc_cmd_paths_copy_tag {
+
+ SKC_CMD_PATHS_COPY_TAG_SEGS,
+ SKC_CMD_PATHS_COPY_TAG_NODE,
+ SKC_CMD_PATHS_COPY_TAG_HEAD,
+
+ SKC_CMD_PATHS_COPY_TAG_COUNT
+
+} skc_cmd_paths_copy_tag;
+
+
+SKC_STATIC_ASSERT(SKC_CMD_PATHS_COPY_TAG_COUNT <= SKC_BLOCK_ID_TAG_COUNT);
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
+
diff --git a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
new file mode 100644
index 0000000000..33992cbdfb
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
@@ -0,0 +1,1349 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+// get rid of these
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "hs/cl/hs_cl_launcher.h"
+
+#include "common/cl/assert_cl.h"
+
+#include "context.h"
+#include "grid.h"
+#include "raster.h"
+#include "extent_ring.h"
+#include "raster_builder.h"
+
+#include "tile.h"
+
+#include "config_cl.h"
+#include "runtime_cl_12.h"
+#include "extent_cl_12.h"
+#include "raster_builder_cl_12.h"
+
+//
+// RASTERIZATION SUB-PIPELINE
+// --------------------------
+//
+// Phase 1: expand commands
+//
+// Phase 2: rasterize
+//
+// Phase 3: sort & segment || release paths
+//
+// Phase 4: prefix
+//
+// Phase 5: release rasters
+//
+// RASTER COHORT
+// ==============
+//
+// BUILDER RASTERIZER POST PROCESSING
+// <-----------------------------------------------> <------------> <--------------------------------------------------------------------->
+//
+// fill cmds transforms raster clips path release rasterize cmds cohort map raster release TTSB TTSK cohort atomics context atomics
+// --------- ---------- ------------ ------------ -------------- ---------- -------------- ---- ---- -------------- ---------------
+// 1,2 1,2 1,2 1,2 2 1-4 1,2,3,4 2-4 2-4 2-4 global
+//
+//
+// NOTES: FINE-GRAINED SVM
+// -----------------------
+//
+// 1) In a fine-grained system we know the exact number of
+// rasterize cmds per segment type before phase 1
+//
+// 2) A raster that's "under construction" shouldn't be rasterized
+// until it is complete. This implies that a raster is not part
+// of a cohort until it is complete. The raster builder must
+// handle raster promises being "forced" to completion -- this is
+// likely the result of composition construction and subsequent
+// rendering to a surface.
+//
+// 3) The raster cohort rasterizer state retains the fill cmd,
+// transform, raster clip and path release "ring" extents.
+//
+// 4) The rasterize cmd extent sizes (line, quad, cubic, rational
+// quad, rational cubic) are known ahead of time.
+//
+// 5) The raster cohort post processor is standalone and retains the
+// raster_map, cohort atomics, TTSK_RYX extent, and raster
+// references until complete.
+//
+
+//
+// Notes:
+//
+// - Could have a pipeline stage before expansion count the exact
+// number of line/quad/cubic commands but the command buffers are
+// relatively small (64-bit commands * # of path segments).
+//
+
+// raster
+// cohort atomics path_ids raster_ids transforms clips cmds_fill cmds_l/q/c ttsk_ryx
+//
+//
+// BEGIN ^
+// |
+// EXPAND |
+// |
+// RASTERIZE |
+// |
+// SORT || RELEASE PATHS |
+// |
+// PREFIX |
+// |
+// RELEASE RASTERS |
+// |
+// END v
+//
+//
+// BEGIN
+//
+// EXPAND -- PRODUCES: one or more extents of rasterization commands
+//
+// RASTERIZE -- DEPENDENCY: requires size of command extents before launching
+// -- PRODUCES: an extent of ttsk_ryx keys
+//
+// SORT || RELEASE PATHS -- DEPENDENCY: requires size of key extent before launching
+// -- PRODUCES: sorted array of keys
+//
+// PREFIX -- DEPENDENCY: none -- can execute after SORT because grid size is number of rasters
+//
+// RELEASE RASTERS -- DEPENDENCY: none -- can execute after prefix
+//
+// END
+//
+
+// ------------------------
+//
+// DEPENDENCY is cleanly implemented with a host callback or device kernel launcher
+//
+// Can this hide resource acquisition? Yes. But there are two cases:
+//
+// 1. acquisition of resources occurs on the host thread and lack of
+// resources drains the host command queue until resources are
+// available (OpenCL 2.x)
+//
+// 2.
the host commands lazily acquire resources (OpenCL 1.2) +// +// ------------------------ +// +// How to express? +// +// Each substage launches its successors. This supports both dependency models. +// +// If OpenCL 1.2 then the substage can't be launched until the prior +// stage's event is complete. So this requires registering a callback +// to invoke the substage. +// +// ------------------------ + +// +// BUILD +// + +struct skc_raster_builder_impl +{ + struct skc_raster_builder * raster_builder; + struct skc_runtime * runtime; + + skc_grid_t cohort; + + // these are all durable/perm extents + struct skc_extent_phrwg_thr1s path_ids; // read/write by host + struct skc_extent_phw1g_tdrNs transforms; // write once by host + read by device + struct skc_extent_phw1g_tdrNs clips; // write once by host + read by device + struct skc_extent_phw1g_tdrNs fill_cmds; // write once by host + read by device + struct skc_extent_phrwg_tdrNs raster_ids; // read/write by host + read by device + + struct { + cl_kernel fills_expand; + cl_kernel rasterize_all; + cl_kernel segment; + cl_kernel rasters_alloc; + cl_kernel prefix; + } kernels; +}; + +// +// RASTER COHORT +// +// This sub-pipeline snapshots the raster builder and then acquires +// and releases host and device resources as necessary (as late as +// possible). +// +// Note that the cohort extents are ephemeral and are only used by one +// or more stages of a the rasterization sub-pipeline. +// +// The pipeline implementation may vary between compute platforms. +// + +struct skc_raster_cohort +{ + struct skc_raster_builder_impl * impl; + + struct skc_extent_phrwg_thr1s_snap path_ids; // read/write by host + struct skc_extent_phw1g_tdrNs_snap transforms; // write once by host + read by device + struct skc_extent_phw1g_tdrNs_snap clips; // write once by host + read by device + struct skc_extent_phw1g_tdrNs_snap fill_cmds; // write once by host + read by device + struct skc_extent_phrwg_tdrNs_snap raster_ids; // read/write by host + read by device + + cl_command_queue cq; + + // sub-pipeline atomics + struct skc_extent_thr_tdrw atomics; + + // path primitives are expanded into line/quad/cubic/rational cmds + struct skc_extent_tdrw cmds; + + // rasterization output + struct skc_extent_tdrw keys; + // struct skc_extent_thrw_tdrw keys; + + // post-sort extent with metadata for each raster + struct skc_extent_tdrw metas; + // struct skc_extent_thrw_tdrw metas; + + // subbuf id + skc_subbuf_id_t id; + + // + // pipeline also uses the following global resources: + // + // - command queue from global factory + // - global block pool and its atomics + // - global path and raster host id map + // - temporary host and device allocations + // +}; + +// +// TTRK (64-BIT COMPARE) +// +// 0 63 +// | TTSB ID | X | Y | COHORT ID | +// +---------+------+------+-----------+ +// | 27 | 12 | 12 | 13 | +// +// +// TTRK (32-BIT COMPARE) +// +// 0 63 +// | TTSB ID | N/A | X | Y | COHORT ID | +// +---------+-----+------+------+-----------+ +// | 27 | 5 | 12 | 12 | 8 | +// + +// +// TTRK is sortable intermediate key format for TTSK +// +// We're going to use the 32-bit comparison version for now +// + +union skc_ttrk +{ + skc_ulong u64; + skc_uint2 u32v2; + + struct { + skc_uint block : SKC_TTXK_LO_BITS_ID; + skc_uint na0 : SKC_TTRK_LO_BITS_NA; + skc_uint x : SKC_TTXK_HI_BITS_X; + skc_uint y : SKC_TTXK_HI_BITS_Y; + skc_uint cohort : SKC_TTRK_HI_BITS_COHORT; + }; + + struct { + skc_uint na1; + skc_uint yx : SKC_TTXK_HI_BITS_YX; + skc_uint na2 : SKC_TTRK_HI_BITS_COHORT; + }; + 
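// (the third view below fuses y and cohort into a single "cohort_y"
// field -- presumably so the TTRK segmenting kernel can detect a new
// cohort or a new tile row with one compare)
//
// [editorial sketch -- not part of this change] packing a TTRK key
// from its fields under the 32-bit-compare layout documented above
// (27/5/12/12/8 bits); the helper name is hypothetical:
#if 0
static union skc_ttrk
skc_ttrk_make(skc_uint const block, skc_uint const x, skc_uint const y, skc_uint const cohort)
{
  union skc_ttrk k;

  k.u32v2.lo = block;                          // TTSB block id in the low 27 bits, 5 bits unused
  k.u32v2.hi = x | (y << 12) | (cohort << 24); // x:12 | y:12 | cohort:8

  return k;
}
#endif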
+ struct { + skc_uint na3; + skc_uint na4 : SKC_TTXK_HI_BITS_X; + skc_uint cohort_y : SKC_TTRK_HI_BITS_COHORT_Y; + }; +}; + +// +// +// + +static +void +skc_raster_builder_pfn_release(struct skc_raster_builder_impl * const impl) +{ + // decrement reference count + if (--impl->raster_builder->refcount != 0) + return; + + // + // otherwise, dispose of the the raster builder and its impl + // + struct skc_runtime * const runtime = impl->runtime; + + // free the raster builder + skc_runtime_host_perm_free(runtime,impl->raster_builder); + + // free durable/perm extents + skc_extent_phrwg_thr1s_free(runtime,&impl->path_ids); + skc_extent_phw1g_tdrNs_free(runtime,&impl->transforms); + skc_extent_phw1g_tdrNs_free(runtime,&impl->clips); + skc_extent_phw1g_tdrNs_free(runtime,&impl->fill_cmds); + skc_extent_phrwg_tdrNs_free(runtime,&impl->raster_ids); + + // release kernels + cl(ReleaseKernel(impl->kernels.fills_expand)); + cl(ReleaseKernel(impl->kernels.rasterize_all)); + +#if 0 + cl(ReleaseKernel(impl->kernels.rasterize_lines)); + cl(ReleaseKernel(impl->kernels.rasterize_quads)); + cl(ReleaseKernel(impl->kernels.rasterize_cubics)); +#endif + + cl(ReleaseKernel(impl->kernels.segment)); + cl(ReleaseKernel(impl->kernels.rasters_alloc)); + cl(ReleaseKernel(impl->kernels.prefix)); + + // free the impl + skc_runtime_host_perm_free(runtime,impl); +} + +// +// +// + +static +void +skc_raster_builder_rasters_release(struct skc_runtime * const runtime, + skc_raster_t const * const rasters, + skc_uint const size, + skc_uint const from, + skc_uint const to) +{ + if (from <= to) // no wrap + { + skc_raster_t const * rasters_from = rasters + from; + skc_uint count_from = to - from; + + skc_grid_deps_unmap(runtime->deps,rasters_from,count_from); + skc_runtime_raster_device_release(runtime,rasters_from,count_from); + } + else // from > to implies wrap + { + skc_raster_t const * rasters_lo = rasters + from; + skc_uint count_lo = size - from; + + skc_grid_deps_unmap(runtime->deps,rasters_lo,count_lo); + skc_runtime_raster_device_release(runtime,rasters_lo,count_lo); + + skc_grid_deps_unmap(runtime->deps,rasters,to); + skc_runtime_raster_device_release(runtime,rasters,to); + } +} + +static +void +skc_raster_builder_paths_release(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s_snap * const snap) +{ + // release lo + skc_runtime_path_device_release(runtime,snap->hr1.lo,snap->count.lo); + + // release hi + if (snap->count.hi) + skc_runtime_path_device_release(runtime,snap->hr1.hi,snap->count.hi); +} + +static +void +skc_raster_builder_cohort_grid_pfn_dispose(skc_grid_t const grid) +{ + // + // ALLOCATED RESOURCES + // + // path_ids - + // raster_ids a + // transforms - + // clips - + // fill_cmds - + // cq a + // cohort atomics a + // cmds - + // keys a + // meta a + // + + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + struct skc_raster_builder_impl * const impl = cohort->impl; + struct skc_runtime * const runtime = impl->runtime; + + // + // release paths -- FIXME -- Note that releasing paths can be + // performed after rasterization is complete + // + + // snap alloc the paths -- this host snap simply sets up pointers + skc_extent_phrwg_thr1s_snap_alloc(runtime,&impl->path_ids,&cohort->path_ids); + + // unmap and release raster ids + skc_raster_builder_paths_release(runtime,&cohort->path_ids); + + // release path ids + skc_extent_phrwg_thr1s_snap_free(runtime,&cohort->path_ids); + + // + // release rasters + // + skc_uint const size = cohort->raster_ids.snap->ring->size.pow2; 
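// [editorial sketch -- not part of this change] the raster_ids snap
// here is a [from,to) window onto a power-of-two ring, which is why
// skc_raster_builder_rasters_release() above releases it as either
// one contiguous run or two runs when the window wraps:
#if 0
  //   no wrap:          [from ........ to)
  //   wrap (from > to): [0 ... to)            [from ... size)
  if (from <= to) {
    release_rasters(rasters + from, to - from);   // single contiguous run
  } else {
    release_rasters(rasters + from, size - from); // tail of the ring
    release_rasters(rasters, to);                 // wrapped head of the ring
  }
  // where release_rasters() stands in for the grid_deps unmap +
  // device release pair performed above
#endif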
+ skc_uint const from = skc_extent_ring_snap_from(cohort->raster_ids.snap); + skc_uint const to = skc_extent_ring_snap_to(cohort->raster_ids.snap); + + // unmap and release raster ids + skc_raster_builder_rasters_release(runtime,impl->raster_ids.hrw,size,from,to); + + // release cohort's remaining allocated resources + skc_extent_phrwg_tdrNs_snap_free(runtime,&cohort->raster_ids); + skc_runtime_release_cq_in_order(runtime,cohort->cq); + skc_extent_thr_tdrw_free(runtime,&cohort->atomics); + skc_extent_tdrw_free(runtime,&cohort->keys); + skc_extent_tdrw_free(runtime,&cohort->metas); + // skc_extent_thrw_tdrw_free(runtime,&cohort->keys); + // skc_extent_thrw_tdrw_free(runtime,&cohort->metas); + skc_runtime_host_temp_free(runtime,cohort,cohort->id); + + // release the raster builder + skc_raster_builder_pfn_release(impl); + + // + // ALLOCATED RESOURCES + // + // path_ids - + // raster_ids - + // transforms - + // clips - + // fill_cmds - + // cq - + // cohort atomics - + // cmds - + // keys - + // meta - + // +} + +// +// +// + +static +void +skc_raster_cohort_prefix_release(skc_grid_t const grid) +{ + // FIXME -- note that pfn_dispose can be accomplished here + + // release the grid + skc_grid_complete(grid); +} + +static +void +skc_raster_cohort_prefix_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + struct skc_scheduler * const scheduler = cohort->impl->runtime->scheduler; + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(scheduler,skc_raster_cohort_prefix_release,grid); +} + +// +// +// + +#if 0 +static +int cmp64(const void * ptr_a, const void * ptr_b) +{ + skc_ulong const a = *(const skc_ulong *)ptr_a; + skc_ulong const b = *(const skc_ulong *)ptr_b; + + if (a < b) return -1; + if (a > b) return +1; + else return 0; +} +#endif + +// +// +// + +static +void +skc_raster_cohort_sort_prefix(skc_grid_t const grid) +{ + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms a + // clips a + // fill_cmds - + // cq a + // cohort atomics a + // cmds a + // keys a + // meta - + // + + // use the backpointers + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + struct skc_raster_builder_impl * const impl = cohort->impl; + struct skc_runtime * const runtime = impl->runtime; + + // release transforms + skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->transforms); + + // release clips + skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->clips); + + // release expanded cmds + skc_extent_tdrw_free(runtime,&cohort->cmds); + + // alloc the snapshost -- could be zero-sized + skc_extent_phrwg_tdrNs_snap_alloc(runtime, + &impl->raster_ids, + &cohort->raster_ids, + cohort->cq,NULL); + + // will never be zero + skc_uint const rasters = skc_extent_ring_snap_count(cohort->raster_ids.snap); + + // acquire fixed-size device-side extent + skc_extent_tdrw_alloc(runtime, + &cohort->metas, + sizeof(struct skc_raster_cohort_meta)); + + // skc_extent_thrw_tdrw_alloc(runtime, + // &cohort->metas, + // sizeof(struct skc_raster_cohort_meta)); + + // zero the metas + skc_extent_tdrw_zero(&cohort->metas,cohort->cq,NULL); + + // get the read-only host copy of the device atomics + struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr; + + // + // SORT + // + if (atomics->keys > 0) + { +#ifndef NDEBUG + fprintf(stderr,"raster cohort sort: %u\n",atomics->keys); +#endif + + // + // + // + 
uint32_t keys_padded_in, keys_padded_out; + + hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); + + hs_sort(cohort->cq, + cohort->keys.drw, + cohort->keys.drw, + atomics->keys, + keys_padded_in, + keys_padded_out, + false); + + cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(cohort->keys.drw))); + cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(cohort->metas.drw))); + +#ifndef NDEBUG + fprintf(stderr,"post-sort\n"); +#endif + + // find start of each tile + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK, + cohort->cq, + impl->kernels.segment, + atomics->keys, + 0,NULL,NULL); + +#ifndef NDEBUG + fprintf(stderr,"post-segment\n"); +#endif + + // + // DELETE ALL THIS WHEN READY + // + +#if 0 + // + // + // + cl(Finish(cohort->cq)); + + // map keys to host + union skc_ttrk * const keys = skc_extent_thrw_tdrw_map(&cohort->keys, + cohort->cq, + NULL); + // map meta to host + struct skc_raster_cohort_meta * const metas = skc_extent_thrw_tdrw_map(&cohort->metas, + cohort->cq, + NULL); + // block until done + cl(Finish(cohort->cq)); + + // sort keys + qsort(keys,atomics->keys,sizeof(*keys),cmp64); + + // mask to determine if rk id is a new block + skc_uint const subblock_mask = runtime->config->block.subblocks - 1; + + // + // some counters + // + union skc_raster_cohort_meta_in meta_in = { + .blocks = 0, + .offset = 0, + .pk = 0, + .rk = 0 + }; + + // get first key + union skc_ttrk curr = keys[0]; + + skc_uint ii=0, jj=0; + + // for all TTRK keys + while (true) + { + // increment ttrk count + meta_in.rk += 1; + + // was this a new block? + if ((curr.u32v2.lo & subblock_mask) == 0) + meta_in.blocks += 1; + + // break if we're out of keys + if (++ii >= atomics->keys) + break; + + // otherwise, process next key + union skc_ttrk const next = keys[ii]; + + // if new cohort then save curr meta and init next meta + if (next.cohort != curr.cohort) + { + fprintf(stderr,"[ %u, %u, %u, %u ]\n", + meta_in.blocks, + meta_in.offset, + meta_in.pk, + meta_in.rk); + + // store back to buffer + metas->inout[curr.cohort].in = meta_in; + + // update meta_in + meta_in.blocks = 0; + meta_in.offset = ii; + meta_in.pk = 0; + meta_in.rk = 0; + } + // otherwise, if same y but new x then increment TTPK count + else if ((next.y == curr.y) && (next.x != curr.x)) + { + meta_in.pk += 1; + +#if 0 + fprintf(stderr,"%3u : %3u : ( %3u, %3u ) -> ( %3u )\n", + jj++,curr.cohort,curr.y,curr.x,next.x); +#endif + } + +#if 0 + fprintf(stderr,"( %3u, %3u )\n",next.y,next.x); +#endif + + curr = next; + } + + fprintf(stderr,"[ %u, %u, %u, %u ]\n", + meta_in.blocks, + meta_in.offset, + meta_in.pk, + meta_in.rk); + + // store back to buffer + metas->inout[curr.cohort].in = meta_in; + + + // unmap + skc_extent_thrw_tdrw_unmap(&cohort->keys, + keys, + cohort->cq, + NULL); + + // unmap + skc_extent_thrw_tdrw_unmap(&cohort->metas, + metas, + cohort->cq, + NULL); +#endif + } + +#ifndef NDEBUG + fprintf(stderr,"rasters_alloc: %u\n",rasters); +#endif + + // + // RASTER ALLOC/INIT + // + cl(SetKernelArg(impl->kernels.rasters_alloc,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.rasters_alloc,1,SKC_CL_ARG(runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.rasters_alloc,2,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); + cl(SetKernelArg(impl->kernels.rasters_alloc,3,SKC_CL_ARG(runtime->handle_pool.map.drw))); + cl(SetKernelArg(impl->kernels.rasters_alloc,4,SKC_CL_ARG(cohort->metas.drw))); + 
cl(SetKernelArg(impl->kernels.rasters_alloc,5,SKC_CL_ARG(cohort->raster_ids.drN))); + cl(SetKernelArg(impl->kernels.rasters_alloc,6,SKC_CL_ARG(rasters))); + + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC, + cohort->cq, + impl->kernels.rasters_alloc, + rasters, + 0,NULL,NULL); + +#ifndef NDEBUG + fprintf(stderr,"post-alloc\n"); +#endif + + // + // PREFIX + // + cl(SetKernelArg(impl->kernels.prefix,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.prefix,1,SKC_CL_ARG(runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.prefix,2,SKC_CL_ARG(runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.prefix,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); + + cl(SetKernelArg(impl->kernels.prefix,4,SKC_CL_ARG(cohort->keys.drw))); + cl(SetKernelArg(impl->kernels.prefix,5,SKC_CL_ARG(runtime->handle_pool.map.drw))); + + cl(SetKernelArg(impl->kernels.prefix,6,SKC_CL_ARG(cohort->metas.drw))); + cl(SetKernelArg(impl->kernels.prefix,7,SKC_CL_ARG(rasters))); + + cl_event complete; + + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_PREFIX, + cohort->cq, + impl->kernels.prefix, + rasters, + 0,NULL, + &complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_prefix_cb,grid)); + cl(ReleaseEvent(complete)); + +#ifndef NDEBUG + fprintf(stderr,"post-prefix\n"); +#endif + + // flush command queue + cl(Flush(cohort->cq)); + + // + // ALLOCATED RESOURCES + // + // path_ids a + // raster_ids a + // transforms - + // clips - + // fill_cmds - + // cq a + // cohort atomics a + // cmds - + // keys a + // meta a + // +} + +static +void +skc_raster_cohort_rasterize_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_sort_prefix,grid); +} + +static +void +skc_raster_cohort_rasterize(skc_grid_t const grid) +{ + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms i + // clips i + // fill_cmds s + // cq a + // cohort atomics a + // cmds a + // cmds_quad a + // cmds_cubic a + // keys - + // meta - + + // use the backpointers + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + struct skc_raster_builder_impl * const impl = cohort->impl; + struct skc_runtime * const runtime = impl->runtime; + + // + // RELEASED RESOURCES + // + // cmds snap + // + + // release the cmds extent and snap since it's only used by the expand stage + skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->fill_cmds); + + // + // NEW ALLOCATED RESOURCES + // + // transforms snap + // clips snap + // ttrk keys + // + skc_extent_phw1g_tdrNs_snap_alloc(runtime, + &impl->transforms, + &cohort->transforms, + cohort->cq,NULL); + + skc_extent_phw1g_tdrNs_snap_alloc(runtime, + &impl->clips, + &cohort->clips, + cohort->cq,NULL); + + // acquire device-side extent + skc_extent_tdrw_alloc(runtime, + &cohort->keys, + sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys); + + // skc_extent_thrw_tdrw_alloc(runtime, + // &cohort->keys, + // sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys); + + // + // acquire out-of-order command queue + // + // and launch up to 3 kernels + // + // for each kernel: + // + // set runtime "global" kernel args: + // + // - block pool atomics + // - block pool extent + // 
+ // set cohort "local" kernel args: + // + // - atomics + // - cmds + // + // enqueue barrier + // enqueue copy back of atomics on the command queue + // set callback on copy back event + // release command queue + // + struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr; + + if (atomics->cmds > 0) + { + cl(SetKernelArg(impl->kernels.rasterize_all,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,1,SKC_CL_ARG(runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,2,SKC_CL_ARG(runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); + + cl(SetKernelArg(impl->kernels.rasterize_all,4,SKC_CL_ARG(cohort->atomics.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,5,SKC_CL_ARG(cohort->keys.drw))); + + cl(SetKernelArg(impl->kernels.rasterize_all,6,SKC_CL_ARG(cohort->transforms.drN))); + cl(SetKernelArg(impl->kernels.rasterize_all,7,SKC_CL_ARG(cohort->clips.drN))); + cl(SetKernelArg(impl->kernels.rasterize_all,8,SKC_CL_ARG(cohort->cmds.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,9,SKC_CL_ARG(atomics->cmds))); + + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL, + cohort->cq, + impl->kernels.rasterize_all, + atomics->cmds, + 0,NULL,NULL); + } + + // + // copyback number of TTSK keys + // + cl_event complete; + + skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_rasterize_cb,grid)); + cl(ReleaseEvent(complete)); + + // flush command queue + cl(Flush(cohort->cq)); + + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms a + // clips a + // fill_cmds - + // cq a + // cohort atomics a + // cmds a + // keys a + // meta - +} + +static +void +skc_raster_cohort_fills_expand_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_rasterize,grid); +} + +static +void +skc_raster_builder_cohort_grid_pfn_execute(skc_grid_t const grid) +{ + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms i + // clips i + // fill_cmds i + // cq - + // cohort atomics - + // cmds - + // keys - + // meta - + // + + // allocate the cohort + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + + // get impl + struct skc_raster_builder_impl * const impl = cohort->impl; + struct skc_runtime * const runtime = impl->runtime; + + // acquire in-order cq + cohort->cq = skc_runtime_acquire_cq_in_order(runtime); + + // alloc the snapshot -- could be zero-sized + skc_extent_phw1g_tdrNs_snap_alloc(runtime, + &impl->fill_cmds, + &cohort->fill_cmds, + cohort->cq,NULL); + + // flush the cq to get the fill running + // cl(Flush(cohort->cq)); + + // create split atomics + skc_extent_thr_tdrw_alloc(runtime,&cohort->atomics,sizeof(struct skc_raster_cohort_atomic)); + + // zero the atomics + skc_extent_thr_tdrw_zero(&cohort->atomics,cohort->cq,NULL); + + // get config + struct skc_config const * const config = runtime->config; + + // acquire device-side extents + skc_extent_tdrw_alloc(runtime, + &cohort->cmds, + sizeof(union skc_cmd_rasterize) * config->raster_cohort.expand.cmds); + + // + // FILLS EXPAND + // + // need result of 
cmd counts before launching RASTERIZE grids + // + // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host + // - OpenCL 2.x: have a kernel size and launch RASTERIZE grids from device + // - or launch a device-wide grid that feeds itself but that's unsatisfying + // + + // how many commands? could be zero + skc_uint const work_size = skc_extent_ring_snap_count(cohort->fill_cmds.snap); + + if (work_size > 0) + { + cl(SetKernelArg(impl->kernels.fills_expand,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.fills_expand,1,SKC_CL_ARG(cohort->atomics.drw))); + cl(SetKernelArg(impl->kernels.fills_expand,2,SKC_CL_ARG(runtime->handle_pool.map.drw))); + cl(SetKernelArg(impl->kernels.fills_expand,3,SKC_CL_ARG(cohort->fill_cmds.drN))); + cl(SetKernelArg(impl->kernels.fills_expand,4,SKC_CL_ARG(cohort->cmds.drw))); + + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_FILLS_EXPAND, + cohort->cq, + impl->kernels.fills_expand, + work_size, + 0,NULL,NULL); + } + + // + // copyback number of rasterization commands + // + cl_event complete; + + skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_fills_expand_cb,grid)); + cl(ReleaseEvent(complete)); + + // flush command queue + cl(Flush(cohort->cq)); + + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms i + // clips i + // fill_cmds s + // cq a + // cohort atomics a + // cmds a + // keys - + // meta - + // +} + +// +// move grid into waiting state +// +// this entails allocating a cohort from the temporary extent +// + +static +void +skc_raster_builder_cohort_grid_pfn_waiting(skc_grid_t const grid) +{ + // get the impl + struct skc_raster_builder_impl * const impl = skc_grid_get_data(grid); + struct skc_runtime * const runtime = impl->runtime; + + // retain the raster builder + impl->raster_builder->refcount += 1; + + // allocate the ephemeral/temp cohort + skc_subbuf_id_t id; + + struct skc_raster_cohort * const cohort = + skc_runtime_host_temp_alloc(runtime, + SKC_MEM_FLAGS_READ_WRITE, + sizeof(*cohort), + &id, + NULL); + + // save the id and backpointer + cohort->id = id; + cohort->impl = impl; + + // set grid data -- replaces impl + skc_grid_set_data(grid,cohort); + + // + // ACQUIRE RESOURCES FOR THE COHORT + // + + struct skc_raster_builder * const raster_builder = impl->raster_builder; + + // immediately take snapshots of all rings -- these are very inexpensive operations + skc_extent_phrwg_thr1s_snap_init(runtime,&raster_builder->path_ids .ring,&cohort->path_ids); + skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->transforms.ring,&cohort->transforms); + skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->clips .ring,&cohort->clips); + skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->fill_cmds .ring,&cohort->fill_cmds); + skc_extent_phrwg_tdrNs_snap_init(runtime,&raster_builder->raster_ids.ring,&cohort->raster_ids); + + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms i + // clips i + // fill_cmds i + // cq - + // cohort atomics - + // cmds - + // keys - + // meta - + // +} + +// +// +// + +static +void +skc_raster_builder_cohort_create(struct skc_raster_builder_impl * const impl) +{ + // attach a grid + impl->cohort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + &impl->cohort, + impl, + skc_raster_builder_cohort_grid_pfn_waiting, + skc_raster_builder_cohort_grid_pfn_execute, + 
skc_raster_builder_cohort_grid_pfn_dispose); +} + +// +// +// + +static +skc_err +skc_raster_builder_pfn_add(struct skc_raster_builder_impl * const impl, + skc_path_t const * paths, + skc_uint count) +{ + // validate and retain the path + skc_err err; + + err = skc_runtime_handle_device_validate_retain(impl->runtime, + SKC_TYPED_HANDLE_TYPE_IS_PATH, + paths, + count); + + if (err) + return err; + + skc_runtime_handle_device_retain(impl->runtime,paths,count); + + // make sure there is a grid + if (impl->cohort == NULL) { + skc_raster_builder_cohort_create(impl); + } + + // declare rasterization grid happens after path + while (count-- > 0) + skc_grid_happens_after_handle(impl->cohort,SKC_TYPED_HANDLE_TO_HANDLE(*paths++)); + + return SKC_ERR_SUCCESS; +} + +// +// +// + +static +void +skc_raster_builder_pfn_end(struct skc_raster_builder_impl * const impl, skc_raster_t * const raster) +{ + // + // acquire host-managed path raster handle and bump reference count + // to 2 handles will be released (reduced to 1) once the rasters are + // completely rasterized + // + *raster = skc_runtime_handle_device_acquire(impl->runtime); + + // make sure there is a grid + if (impl->cohort == NULL) { + skc_raster_builder_cohort_create(impl); + } + + // map a handle to a grid + skc_grid_map(impl->cohort,*raster); +} + +// +// snapshot the ring and lazily start the grid +// +// FIXME -- might want to revisit this and settle on an even more +// opaque implementation. Some options: +// +// - never let the SKC API expose a forced grid start +// - make snapshots kick off a forced grid start +// - be lazy all the time everywhere +// + +static +void +skc_raster_builder_pfn_start(struct skc_raster_builder_impl * const impl) +{ + skc_grid_t const cohort = impl->cohort; + + if (cohort != NULL) { + skc_grid_start(cohort); + } +} + +// +// NOTE: THIS MIGHT BE REMOVED +// + +static +void +skc_raster_builder_pfn_force(struct skc_raster_builder_impl * const impl) +{ + skc_grid_t const cohort = impl->cohort; + + if (cohort != NULL) { + skc_grid_force(cohort); + } +} + +// +// +// + +skc_err +skc_raster_builder_cl_12_create(struct skc_context * const context, + struct skc_raster_builder * * const raster_builder) +{ + struct skc_runtime * const runtime = context->runtime; + + // allocate raster builder + (*raster_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**raster_builder)); + + // refcount + (*raster_builder)->refcount = 1; + + // state + SKC_ASSERT_STATE_INIT((*raster_builder),SKC_RASTER_BUILDER_STATE_READY); + + // allocate runtime raster builder + struct skc_raster_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + // save the impl + (*raster_builder)->impl = impl; + + // intialize impl + impl->raster_builder = (*raster_builder); + impl->runtime = runtime; + impl->cohort = NULL; + + // get config + struct skc_config const * const config = runtime->config; + + skc_extent_phrwg_thr1s_alloc(runtime,&impl->path_ids ,sizeof(skc_path_t ) * config->raster_cohort.path_ids .elem_count); + skc_extent_phw1g_tdrNs_alloc(runtime,&impl->transforms,sizeof(union skc_transform) * config->raster_cohort.transforms.elem_count); + skc_extent_phw1g_tdrNs_alloc(runtime,&impl->clips ,sizeof(union skc_path_clip) * config->raster_cohort.clips .elem_count); + skc_extent_phw1g_tdrNs_alloc(runtime,&impl->fill_cmds ,sizeof(union skc_cmd_fill ) * config->raster_cohort.fill .elem_count); + skc_extent_phrwg_tdrNs_alloc(runtime,&impl->raster_ids,sizeof(skc_raster_t 
) * config->raster_cohort.raster_ids.elem_count); + + // retain the context + //skc_context_retain(context); + + (*raster_builder)->context = context; + + (*raster_builder)->add = skc_raster_builder_pfn_add; + (*raster_builder)->end = skc_raster_builder_pfn_end; + (*raster_builder)->start = skc_raster_builder_pfn_start; + (*raster_builder)->force = skc_raster_builder_pfn_force; + (*raster_builder)->release = skc_raster_builder_pfn_release; + + // initialize raster builder with host-writable buffers + (*raster_builder)->path_ids .extent = impl->path_ids.hrw; + (*raster_builder)->transforms.extent = impl->transforms.hw1; + (*raster_builder)->clips .extent = impl->clips.hw1; + (*raster_builder)->fill_cmds .extent = impl->fill_cmds.hw1; + (*raster_builder)->raster_ids.extent = impl->raster_ids.hrw; + + // + // the rings perform bookkeeping on the extents + // + // the ring snapshotting and checkpointing are necessary because + // another part of the API can _force_ the raster cohort to flush + // its work-in-progress commands but only up to a checkpointed + // boundary + // + skc_extent_ring_init(&(*raster_builder)->path_ids.ring, + config->raster_cohort.path_ids.elem_count, + config->raster_cohort.path_ids.snap_count, + sizeof(skc_path_t)); + + skc_extent_ring_init(&(*raster_builder)->transforms.ring, + config->raster_cohort.transforms.elem_count, + config->raster_cohort.transforms.snap_count, + sizeof(union skc_transform)); + + skc_extent_ring_init(&(*raster_builder)->clips.ring, + config->raster_cohort.clips.elem_count, + config->raster_cohort.clips.snap_count, + sizeof(union skc_path_clip)); + + skc_extent_ring_init(&(*raster_builder)->fill_cmds.ring, + config->raster_cohort.fill.elem_count, + config->raster_cohort.fill.snap_count, + sizeof(union skc_cmd_fill)); + + skc_extent_ring_init(&(*raster_builder)->raster_ids.ring, + config->raster_cohort.raster_ids.elem_count, + config->raster_cohort.raster_ids.snap_count, + sizeof(skc_raster_t)); + + // + // acquire kernels + // + impl->kernels.fills_expand = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_FILLS_EXPAND); + impl->kernels.rasterize_all = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL); + +#if 0 + impl->kernels.rasterize_lines = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES); + impl->kernels.rasterize_quads = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS); + impl->kernels.rasterize_cubics = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS); +#endif + + impl->kernels.segment = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK); + impl->kernels.rasters_alloc = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC); + impl->kernels.prefix = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_PREFIX); + + return SKC_ERR_SUCCESS; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.h b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.h new file mode 100644 index 0000000000..f6e1751ef1 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.h @@ -0,0 +1,165 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
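The ring bookkeeping above (elem_count plus snap_count, with forced flushes bounded by a checkpoint) is easier to see in a tiny host-side sketch. The struct and helpers below are a hypothetical illustration of that snapshot idea, not the skc_extent_ring API.

/* hypothetical sketch of the snapshot/checkpoint idea -- not skc_extent_ring */
#include <stdint.h>

struct ring_demo
{
  uint32_t reads;  /* entries already drained by the device            */
  uint32_t writes; /* entries appended by the host                     */
  uint32_t snap;   /* checkpoint captured when a snapshot was taken    */
};

static void
ring_demo_snap(struct ring_demo * const r)
{
  r->snap = r->writes;       /* freeze the boundary a forced flush may reach */
}

static uint32_t
ring_demo_flushable(struct ring_demo const * const r)
{
  return r->snap - r->reads; /* work-in-progress past the snapshot stays put */
}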
+ * + */ + +#ifndef SKC_RASTER_BUILDER_CL_12_ONCE +#define SKC_RASTER_BUILDER_CL_12_ONCE + +// +// +// + +#include "types.h" +#include "macros.h" +#include "common.h" + +// +// FIXME -- these magic numbers will be replaced with tile.h constants +// although they're probably universal across all devices +// +// FIXME -- NEED TO EVALUATE IF THIS DISTRIBUTION OF BITS IS GOING TO +// BE TOO SMALL -- plenty of room to jiggle these bits +// + +#define SKC_CMD_RASTERIZE_BITS_TRANSFORM 12 +#define SKC_CMD_RASTERIZE_BITS_CLIP 12 +#define SKC_CMD_RASTERIZE_BITS_COHORT 8 + +SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_TRANSFORM == SKC_CMD_FILL_BITS_TRANSFORM); +SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_CLIP == SKC_CMD_FILL_BITS_CLIP); +SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_COHORT == SKC_CMD_FILL_BITS_COHORT); + +// +// device-side rasterization cmd +// + +union skc_cmd_rasterize +{ + skc_ulong u64; + + skc_uint2 u32v2; + + struct { + // + // Unlike anywhere else in the pipeline, the nodeword index points + // "inside" of a path node (with word resolution). This means + // there is up to 16 GB of 32-bit word addressing in a unified + // block pool: + // + // "16GB ought to be enough for anyone" -- ASM 5/30/17 + // + skc_uint nodeword; +#if defined(__OPENCL_C_VERSION__) + skc_uint tcc; +#else + skc_uint transform : SKC_CMD_RASTERIZE_BITS_TRANSFORM; + skc_uint clip : SKC_CMD_RASTERIZE_BITS_CLIP; + skc_uint cohort : SKC_CMD_RASTERIZE_BITS_COHORT; +#endif + }; +}; + +SKC_STATIC_ASSERT(sizeof(union skc_cmd_rasterize) == sizeof(skc_uint2)); + +// +// +// + +#define SKC_CMD_RASTERIZE_HI_OFFSET_COHORT (SKC_CMD_RASTERIZE_BITS_TRANSFORM + SKC_CMD_RASTERIZE_BITS_CLIP) +#define SKC_CMD_RASTERIZE_MASK_COHORT(c) ((c).u32v2.hi & SKC_BITS_TO_MASK_AT(SKC_CMD_RASTERIZE_BITS_COHORT,SKC_CMD_RASTERIZE_HI_OFFSET_COHORT)) + +#define SKC_CMD_RASTERIZE_GET_TRANSFORM(c) ((c).u32v2.hi & SKC_BITS_TO_MASK(SKC_CMD_RASTERIZE_BITS_TRANSFORM)) +#define SKC_CMD_RASTERIZE_GET_CLIP(c) SKC_BFE((c).tcc,SKC_CMD_RASTERIZE_BITS_CLIP,SKC_CMD_RASTERIZE_BITS_TRANSFORM) +#define SKC_CMD_RASTERIZE_GET_COHORT(c) ((c).u32v2.hi >> SKC_CMD_RASTERIZE_HI_OFFSET_COHORT) +// SKC_BFE((c).tcc,SKC_CMD_RASTERIZE_BITS_COHORT,SKC_CMD_RASTERIZE_HI_OFFSET_COHORT) + +// +// +// + +#define SKC_TTSK_SIZE_COHORT (1 << SKC_CMD_RASTERIZE_BITS_COHORT) + +// +// COHORT META DATA +// + +union skc_raster_cohort_meta_in +{ + skc_uint4 u32v4; + + struct { + skc_uint blocks; // # of rk blocks + skc_uint offset; // start of rk span + skc_uint pk; // # of pk keys + skc_uint rk; // # of rk keys + }; +}; + +union skc_raster_cohort_meta_out +{ + skc_uint4 u32v4; + + struct { + skc_uint blocks; // # of blocks in raster -- initially just rk blocks + skc_uint offset; // start of rk span + skc_uint nodes; // # of nodes in raster -- necessary for walking + skc_uint keys; // # of rk & pk keys -- initially just rk + }; +}; + +union skc_raster_cohort_meta_inout +{ + union skc_raster_cohort_meta_in in; + union skc_raster_cohort_meta_out out; +}; + +// +// followed by one word for the offset +// + +struct skc_raster_cohort_meta +{ + union skc_raster_cohort_meta_inout inout[SKC_TTSK_SIZE_COHORT]; + skc_uint reads[SKC_TTSK_SIZE_COHORT]; // starting ring reads -- [0] is raster head +}; + +#define SKC_RASTER_COHORT_META_OFFSET_READS (SKC_OFFSET_OF(struct skc_raster_cohort_meta,reads) / sizeof(skc_uint)) + +// +// COHORT ATOMICS +// + +struct skc_raster_cohort_atomic +{ + // rasterization input + skc_uint cmds; + + // rasterization output + skc_uint keys; + + // block pool base -- idea here is to 
perform one atomic allocation + // skc_uint bp_base; +}; + +#define SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS 0 +#define SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS 1 + +#define SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS_CALC (SKC_OFFSET_OF(struct skc_raster_cohort_atomic,cmds) / sizeof(skc_uint)) +#define SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS_CALC (SKC_OFFSET_OF(struct skc_raster_cohort_atomic,keys) / sizeof(skc_uint)) + +SKC_STATIC_ASSERT(SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS == SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS_CALC); // verify +SKC_STATIC_ASSERT(SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS == SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS_CALC); // verify + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/runtime_cl.c b/src/compute/skc/platforms/cl_12/runtime_cl.c new file mode 100644 index 0000000000..a745ed013e --- /dev/null +++ b/src/compute/skc/platforms/cl_12/runtime_cl.c @@ -0,0 +1,362 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include +#include +#include + +// +// +// + +#include "runtime_cl.h" +#include "common/cl/assert_cl.h" + +// +// +// + +static is_verbose = true; + +// +// FIXME -- all variable length device queries need to start querying +// the parameter's return size before getting its value +// +// FIXME -- this is now handled by the common/cl/find.* routine +// + +union skc_cl_device_version { + struct { + cl_uchar opencl_space[7]; // "OpenCL_" + cl_uchar major; + cl_uchar dot; + cl_uchar minor; +#if 1 // Intel NEO requires at least 16 bytes + cl_uchar space; + cl_uchar vendor[32]; +#endif + }; + struct { + cl_uchar aN[]; + }; +}; + +typedef cl_bitfield cl_diagnostic_verbose_level_intel; + +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4 + +static +void +CL_CALLBACK +skc_context_callback(char const * error, void const * info, size_t size, void * user) +{ + if (info != NULL ) + { + fprintf(stderr,"%s\n",error); + } +} + +// +// +// + +skc_err +skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, + char const * const target_platform_substring, + char const * const target_device_substring, + cl_context_properties context_properties[]) +{ + skc_err err = SKC_ERR_SUCCESS; + + // + // search available devices for a match + // +#define PLATFORM_IDS_MAX 16 +#define DEVICE_IDS_MAX 16 +#define PLATFORM_NAME_SIZE_MAX 64 +#define DEVICE_NAME_SIZE_MAX 64 +#define DRIVER_VERSION_SIZE_MAX 64 + + cl_int cl_err; + + cl_platform_id platform_ids[PLATFORM_IDS_MAX]; + cl_device_id device_ids [PLATFORM_IDS_MAX][DEVICE_IDS_MAX]; + + cl_uint platform_count; + cl_uint device_count[PLATFORM_IDS_MAX]; + + cl_uint platform_idx = UINT32_MAX, device_idx = UINT32_MAX; + + bool match = false; // find _first_ match + + // + // get number of platforms + // + cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count)); + + // + // search platforms + // + for (cl_uint ii=0; iiversion.major = device_version.major - 48; + runtime_cl->version.minor = device_version.minor - 48; + runtime_cl->base_align = base_align; + + if (is_verbose) { + fprintf(stdout," >>>"); + } + } + else if (is_verbose) + { + fprintf(stdout," "); + } + + if (is_verbose) { + fprintf(stdout, + " %1u: %s [ %s ] [ %s ] [ %u ]\n", + jj, + device_name, + device_version.aN, + driver_version, + base_align); + } + } + } + + if 
(is_verbose) { + fprintf(stdout,"\n"); + } + + // + // get target platform and device + // + if (platform_idx >= platform_count) + { + fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring); + exit(EXIT_FAILURE); + } + if (device_idx >= device_count[platform_idx]) + { + fprintf(stderr,"no match for target device substring %s\n",target_device_substring); + exit(EXIT_FAILURE); + } + + runtime_cl->platform_id = platform_ids[platform_idx]; + runtime_cl->device_id = device_ids [platform_idx][device_idx]; + + // + // create context + // + +#if 0 + cl_context_properties context_properties[] = + { + CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id, + 0 + }; +#else + context_properties[1] = (cl_context_properties)runtime_cl->platform_id; +#endif + + runtime_cl->context = clCreateContext(context_properties, + 1, + &runtime_cl->device_id, + skc_context_callback, + NULL, + &cl_err); + cl_ok(cl_err); + + // + // get device name, driver version, and unified memory flag + // + if (is_verbose) + { + char device_name[DEVICE_NAME_SIZE_MAX]; + char driver_version[DRIVER_VERSION_SIZE_MAX]; + cl_bool device_is_unified; + cl_device_svm_capabilities svm_caps; + size_t printf_buffer_size; + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DEVICE_NAME, + sizeof(device_name), + device_name, + NULL)); + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DRIVER_VERSION, + sizeof(driver_version), + driver_version, + NULL)); + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(device_is_unified), + &device_is_unified, + NULL)); + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DEVICE_SVM_CAPABILITIES, + sizeof(svm_caps), + &svm_caps, + 0)); + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DEVICE_PRINTF_BUFFER_SIZE, + sizeof(printf_buffer_size), + &printf_buffer_size, + NULL)); + + fprintf(stderr, + "CL_DEVICE_SVM_COARSE_GRAIN_BUFFER %c\n" + "CL_DEVICE_SVM_FINE_GRAIN_BUFFER %c\n" + "CL_DEVICE_SVM_FINE_GRAIN_SYSTEM %c\n" + "CL_DEVICE_SVM_ATOMICS %c\n" + "CL_DEVICE_PRINTF_BUFFER_SIZE %zu\n\n", + svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-', + svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? '*' : '-', + svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? '*' : '-', + svm_caps & CL_DEVICE_SVM_ATOMICS ? '*' : '-', + printf_buffer_size); + } + + return err; +} + +// +// +// + +skc_err +skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl) +{ + // FIXME + printf("%s incomplete!\n",__func__); + + return SKC_ERR_SUCCESS; +} + +// +// +// + +cl_command_queue +skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type) +{ + cl_command_queue cq; + + if (runtime_cl->version.major < 2) + { + // + // <= OpenCL 1.2 + // + cl_int cl_err; + + cq = clCreateCommandQueue(runtime_cl->context, + runtime_cl->device_id, + (cl_command_queue_properties)type, + &cl_err); cl_ok(cl_err); + } + else + { + // + // >= OpenCL 2.0 + // + cl_int cl_err; + cl_queue_properties const queue_properties[] = { + CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0 + }; + + cq = clCreateCommandQueueWithProperties(runtime_cl->context, + runtime_cl->device_id, + queue_properties, + &cl_err); cl_ok(cl_err); + } + + return cq; +} + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/runtime_cl.h b/src/compute/skc/platforms/cl_12/runtime_cl.h new file mode 100644 index 0000000000..9e58ca0cc7 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/runtime_cl.h @@ -0,0 +1,79 @@ +/* + * Copyright 2017 Google Inc. 
+ * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// squelch OpenCL 1.2 deprecation warning +// + +#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif + +#include + +// +// +// + +#include "skc.h" + +// +// Minimal OpenCL state needed by the runtime to get started +// + +struct skc_runtime_cl +{ + cl_platform_id platform_id; + cl_device_id device_id; + cl_context context; + + struct { + cl_uint major; + cl_uint minor; + } version; // sometimes we need to know this at runtime + + cl_uint base_align; // base address alignment for subbuffer origins +}; + +// +// +// + +typedef enum skc_cq_type_e { + SKC_CQ_TYPE_IN_ORDER = 0, + SKC_CQ_TYPE_OUT_OF_ORDER = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + SKC_CQ_TYPE_IN_ORDER_PROFILING = (SKC_CQ_TYPE_IN_ORDER | CL_QUEUE_PROFILING_ENABLE), + SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE), +} skc_cq_type_e; + +// +// safely creates a generic OpenCL target in very few lines +// + +skc_err +skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, + char const * const target_platform_substring, + char const * const target_device_substring, + cl_context_properties context_properties[]); + +skc_err +skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl); + +// +// create a command queue with the non-deprecated function +// + +cl_command_queue +skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type); + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.c b/src/compute/skc/platforms/cl_12/runtime_cl_12.c new file mode 100644 index 0000000000..fca13edbbd --- /dev/null +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.c @@ -0,0 +1,314 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include +#include +#include +#include + +// +// +// + +#include "context.h" +#include "block.h" +#include "grid.h" +#include "common/cl/assert_cl.h" +#include "config_cl.h" +#include "runtime_cl.h" +#include "runtime_cl_12.h" +#include "export_cl_12.h" + +// +// +// + +static +void +skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq) +{ + // save size + runtime->block_pool.size = &runtime->config->block_pool; + + // create block extent + skc_extent_pdrw_alloc(runtime, + &runtime->block_pool.blocks, + runtime->block_pool.size->pool_size * + runtime->config->block.bytes); + + // allocate block pool ids + skc_extent_pdrw_alloc(runtime, + &runtime->block_pool.ids, + runtime->block_pool.size->ring_pow2 * sizeof(skc_uint)); + + // allocate block pool atomics + skc_extent_phr_pdrw_alloc(runtime, + &runtime->block_pool.atomics, + sizeof(union skc_block_pool_atomic)); + + // acquire pool id and atomic initialization kernels + cl_kernel k0 = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS); + cl_kernel k1 = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS); + + // init ids + cl(SetKernelArg(k0,0,sizeof(runtime->block_pool.ids.drw),&runtime->block_pool.ids.drw)); + cl(SetKernelArg(k0,1,SKC_CL_ARG(runtime->block_pool.size->pool_size))); + + // the kernel grid is shaped by the target device -- always 2 for atomics + skc_device_enqueue_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS, + cq,k0,runtime->block_pool.size->pool_size, + 0,NULL,NULL); + + // init atomics + cl(SetKernelArg(k1,0,sizeof(runtime->block_pool.atomics.drw),&runtime->block_pool.atomics.drw)); + cl(SetKernelArg(k1,1,SKC_CL_ARG(runtime->block_pool.size->pool_size))); + + // the kernel grid is shaped by the target device + skc_device_enqueue_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS, + cq,k1,2, + 0,NULL,NULL); + + // kickstart kernel execution + cl(Flush(cq)); + + // release kernels + cl(ReleaseKernel(k0)); + cl(ReleaseKernel(k1)); +} + +static +void +skc_block_pool_dispose(struct skc_runtime * const runtime) +{ + skc_extent_phr_pdrw_free(runtime,&runtime->block_pool.atomics); + skc_extent_pdrw_free (runtime,&runtime->block_pool.ids); + skc_extent_pdrw_free (runtime,&runtime->block_pool.blocks); +} + +// +// +// + +static +bool +skc_runtime_yield(struct skc_runtime * const runtime) +{ + return skc_scheduler_yield(runtime->scheduler); +} + +static +void +skc_runtime_wait(struct skc_runtime * const runtime) +{ + skc_scheduler_wait(runtime->scheduler); +} + +// +// +// + +skc_err +skc_runtime_cl_12_create(struct skc_context * const context, + char const * const target_platform_substring, + char const * const target_device_substring, + cl_context_properties context_properties[]) +{ + // allocate the runtime + struct skc_runtime * const runtime = malloc(sizeof(*runtime)); + + // acquire OpenCL ids and context for target device + skc_err err = skc_runtime_cl_create(&runtime->cl, + target_platform_substring, + target_device_substring, + context_properties); + + // create device + skc_device_create(runtime); + + // create the host and device allocators + skc_allocator_host_create(runtime); + skc_allocator_device_create(runtime); + + // how many slots in the scheduler? 
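The bring-up above starts from the small bootstrap declared in runtime_cl.h, so a standalone usage sketch of those two entry points may help. The platform and device substrings and the property list below are example values only; skc_runtime_cl_create() itself patches the CL_CONTEXT_PLATFORM slot with the matched platform.

/* illustrative use of the runtime_cl.h entry points -- example values only */
#include <CL/opencl.h>

#include "runtime_cl.h"

static void
demo_bringup(void)
{
  struct skc_runtime_cl rt;

  /* create() fills context_properties[1] with the matched platform id */
  cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 };

  skc_err const err = skc_runtime_cl_create(&rt,"Intel","Graphics",props);

  if (err == SKC_ERR_SUCCESS)
    {
      cl_command_queue const cq = skc_runtime_cl_create_cq(&rt,SKC_CQ_TYPE_IN_ORDER_PROFILING);

      /* ... enqueue and profile work ... */

      clReleaseCommandQueue(cq);

      skc_runtime_cl_dispose(&rt);
    }
}

skc_runtime_cl_12_create() above performs the same first step with the context_properties forwarded from its caller before layering the allocators, scheduler, pools, and pfns on top.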
+ runtime->scheduler = skc_scheduler_create(runtime,runtime->config->scheduler.size); + + // allocate deps structure + runtime->deps = skc_grid_deps_create(runtime, + runtime->scheduler, + runtime->config->block_pool.pool_size); + + // initialize cq pool + skc_cq_pool_create(runtime, + &runtime->cq_pool, + runtime->config->cq_pool.type, + runtime->config->cq_pool.size); + + // acquire in-order cq + cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); + + // initialize block pool + skc_block_pool_create(runtime,cq); + + // intialize handle pool + skc_handle_pool_create(runtime, + &runtime->handle_pool, + runtime->config->handle_pool.size, + runtime->config->handle_pool.width, + runtime->config->handle_pool.recs); + + // + // initialize pfns + // + // FIXME -- at this point we will have identified which device we've + // targeted and will load a DLL (or select from a built-in library) + // that contains all the pfns. + // + context->runtime = runtime; + + context->yield = skc_runtime_yield; + context->wait = skc_runtime_wait; + + context->path_builder = skc_path_builder_cl_12_create; + context->path_retain = skc_runtime_path_host_retain; + context->path_release = skc_runtime_path_host_release; + context->path_flush = skc_runtime_path_host_flush; + + context->raster_builder = skc_raster_builder_cl_12_create; + context->raster_retain = skc_runtime_raster_host_retain; + context->raster_release = skc_runtime_raster_host_release; + context->raster_flush = skc_runtime_raster_host_flush; + + context->composition = skc_composition_cl_12_create; + context->styling = skc_styling_cl_12_create; + + context->surface = skc_surface_cl_12_create; + + // block on pool creation + cl(Finish(cq)); + + // dispose of in-order cq + skc_runtime_release_cq_in_order(runtime,cq); + + return err; +}; + +// +// +// + +skc_err +skc_runtime_cl_12_dispose(struct skc_context * const context) +{ + // + // FIXME -- incomplete + // + fprintf(stderr,"%s incomplete!\n",__func__); + + struct skc_runtime * runtime = context->runtime; + + skc_allocator_device_dispose(runtime); + skc_allocator_host_dispose(runtime); + + skc_scheduler_dispose(context->runtime,context->runtime->scheduler); + + skc_grid_deps_dispose(context->runtime->deps); + + skc_cq_pool_dispose(runtime,&runtime->cq_pool); + + skc_block_pool_dispose(context->runtime); + + // skc_handle_pool_dispose(context->runtime); + + return SKC_ERR_SUCCESS; +} + +// +// TEMPORARY BENCHMARK +// + +#if 1 + +#include + +#define SKC_FRAMES_MASK 0x7F +#define SKC_FRAMES (SKC_FRAMES_MASK + 1) + +void +skc_runtime_cl_12_debug(struct skc_context * const context) +{ +#ifdef NDEBUG + static skc_uint frames=0; + static LARGE_INTEGER StartingTime={0}, EndingTime; + + if ((frames++ & SKC_FRAMES_MASK) != SKC_FRAMES_MASK) + return; + + QueryPerformanceCounter(&EndingTime); + + LARGE_INTEGER ElapsedMicroseconds, Frequency; + + ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart; + + QueryPerformanceFrequency(&Frequency); + + double const msecs_total = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart; + double const msecs_frame = msecs_total / SKC_FRAMES; + + printf("Frames / Total / Per : %u / %.3f / %.3f\n", + SKC_FRAMES,msecs_total,msecs_frame); +#endif + + struct skc_runtime * const runtime = context->runtime; + + // acquire out-of-order cq + cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); + + // copy atomics to host + skc_extent_phr_pdrw_read(&runtime->block_pool.atomics,cq,NULL); + + // block until complete + cl(Finish(cq)); + + // 
dispose of out-of-order cq + skc_runtime_release_cq_in_order(runtime,cq); + + union skc_block_pool_atomic const * const bp_atomic = runtime->block_pool.atomics.hr; + + skc_uint const available = bp_atomic->writes - bp_atomic->reads; + skc_uint const inuse = runtime->config->block_pool.pool_size - available; + + fprintf(stderr,"w/r/f/a: %9u - %9u = %9u : %6.2f MB\n", + bp_atomic->writes, + bp_atomic->reads, + available, + (inuse * runtime->config->block.bytes) / (1024.0*1024.0)); + + if (available >= (1<<27)) + { + fprintf(stderr,"block pool corrupted!\n"); + exit(-1); + } + + // + // + // +#ifdef NDEBUG + QueryPerformanceCounter(&StartingTime); +#endif +} + +#endif + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.h b/src/compute/skc/platforms/cl_12/runtime_cl_12.h new file mode 100644 index 0000000000..7e7ffcb284 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.h @@ -0,0 +1,177 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "runtime.h" +#include "runtime_cl.h" +#include "cq_pool_cl.h" +#include "handle_pool_cl_12.h" +#include "block_pool_cl_12.h" +#include "allocator_device_cl.h" + +// +// FIXME -- two parts: +// +// 1. directly access the structures in the runtime sub-struct implementations +// 2. possibly wall off the non-platform-specific structs into a sub structure +// + +struct skc_runtime +{ + // + // state visible to device + // + struct skc_runtime_cl cl; + + struct { + struct skc_allocator_host host; + struct skc_allocator_device device; + } allocator; + + struct skc_cq_pool cq_pool; + + struct skc_block_pool block_pool; + + struct skc_handle_pool handle_pool; + + // + // state that is slightly opaque (for now) + // + struct skc_scheduler * scheduler; + + struct skc_grid_deps * deps; + + struct skc_config const * config; // FIXME: config will be determined by device with some opportunities to resize + + struct skc_device * device; // opaque bundle of kernels +}; + +// +// Creation and disposal intitializes context and may rely on other +// context resources like the scheduler +// + +skc_err +skc_runtime_cl_12_create(struct skc_context * const context, + char const * const target_platform_substring, + char const * const target_device_substring, + cl_context_properties context_properties[]); + +skc_err +skc_runtime_cl_12_dispose(struct skc_context * const context); + +// +// HOST HANDLE RETAIN/RELEASE/FLUSH +// + +skc_err +skc_runtime_path_host_retain(struct skc_runtime * const runtime, + skc_path_t const * paths, + uint32_t count); + +skc_err +skc_runtime_raster_host_retain(struct skc_runtime * const runtime, + skc_raster_t const * rasters, + uint32_t count); + + +skc_err +skc_runtime_path_host_release(struct skc_runtime * const runtime, + skc_path_t const * paths, + uint32_t count); + +skc_err +skc_runtime_raster_host_release(struct skc_runtime * const runtime, + skc_raster_t const * rasters, + uint32_t count); + + +skc_err +skc_runtime_path_host_flush(struct skc_runtime * const runtime, + skc_path_t const * paths, + uint32_t count); + +skc_err +skc_runtime_raster_host_flush(struct skc_runtime * const runtime, + skc_raster_t const * rasters, + uint32_t count); + +// +// DEVICE/PIPELINE HANDLE ACQUIRE/RETAIN/RELEASE +// +// The retain operations pre-validate handles +// + +skc_handle_t +skc_runtime_handle_device_acquire(struct skc_runtime * const runtime); + +skc_err 
+skc_runtime_handle_device_validate_retain(struct skc_runtime * const runtime, + skc_typed_handle_type_e const handle_type, + skc_typed_handle_t const * typed_handles, + uint32_t count); + +void +skc_runtime_handle_device_retain(struct skc_runtime * const runtime, + skc_handle_t const * handles, + uint32_t count); + +void +skc_runtime_path_device_release(struct skc_runtime * const runtime, + skc_handle_t const * handles, + uint32_t count); + +void +skc_runtime_raster_device_release(struct skc_runtime * const runtime, + skc_handle_t const * handles, + uint32_t count); + +// +// We only use in-order command queues in the pipeline +// + +cl_command_queue +skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime); + +void +skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, + cl_command_queue cq); + +// +// DEVICE MEMORY ALLOCATION +// + +cl_mem +skc_runtime_device_perm_alloc(struct skc_runtime * const runtime, + cl_mem_flags const flags, + size_t const size); + +void +skc_runtime_device_perm_free(struct skc_runtime * const runtime, + cl_mem const mem); + +cl_mem +skc_runtime_device_temp_alloc(struct skc_runtime * const runtime, + cl_mem_flags const flags, + size_t const size, + skc_subbuf_id_t * const subbuf_id, + size_t * const subbuf_size); + +void +skc_runtime_device_temp_free(struct skc_runtime * const runtime, + cl_mem const mem, + skc_subbuf_id_t const subbuf_id); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/styling_cl_12.c b/src/compute/skc/platforms/cl_12/styling_cl_12.c new file mode 100644 index 0000000000..6c84fe6f70 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/styling_cl_12.c @@ -0,0 +1,339 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// NOTES: +// +// - this particular object only needs a command queue for a short +// time so consider acquiring/releasing the command queue on demand +// but only if command queues are cached and expensive to keep +// + +#include "common/cl/assert_cl.h" + +#include "styling_cl_12.h" +#include "extent_cl_12.h" +#include "runtime_cl_12.h" + +#include "context.h" +#include "styling_types.h" + +// +// +// + +static +void +skc_styling_unmap_complete(skc_grid_t const grid) +{ + struct skc_styling_impl * const impl = skc_grid_get_data(grid); + + impl->state = SKC_STYLING_STATE_SEALED; + + skc_grid_complete(grid); +} + +static +void +skc_styling_unmap_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_styling_impl * const impl = skc_grid_get_data(grid); + struct skc_scheduler * const scheduler = impl->runtime->scheduler; + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(scheduler,skc_styling_unmap_complete,grid); +} + +static +void +skc_styling_grid_pfn_execute(skc_grid_t const grid) +{ + struct skc_styling_impl * const impl = skc_grid_get_data(grid); + struct skc_styling * const styling = impl->styling; + + // + // unmap all extents + // + cl_event complete; + + skc_extent_phwN_pdrN_unmap(&impl->layers,styling->layers.extent,impl->cq,NULL); + skc_extent_phwN_pdrN_unmap(&impl->groups,styling->groups.extent,impl->cq,NULL); + skc_extent_phwN_pdrN_unmap(&impl->extras,styling->extras.extent,impl->cq,&complete); + + // set the event + cl(SetEventCallback(complete,CL_COMPLETE,skc_styling_unmap_cb,grid)); + cl(ReleaseEvent(complete)); + + // flush command queue + cl(Flush(impl->cq)); +} + +// +// +// + +static +void +skc_styling_pfn_seal(struct skc_styling_impl * const impl) +{ + // return if sealing or sealed + if (impl->state >= SKC_STYLING_STATE_SEALING) + return; + + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // + // otherwise, wait for UNSEALING > UNSEALED transition + // + if (impl->state == SKC_STYLING_STATE_UNSEALING) + { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); + } + + // + // we're unsealed so we need to seal and start the grid + // + impl->state = SKC_STYLING_STATE_SEALING; + impl->grid = SKC_GRID_DEPS_ATTACH(runtime->deps, + NULL, + impl, + NULL, // no waiting + skc_styling_grid_pfn_execute, + NULL); // no dispose + + // no need to force -- styling has no dependencies + skc_grid_start(impl->grid); +} + +// +// +// + +void +skc_styling_unseal_complete(struct skc_styling_impl * const impl) +{ + struct skc_runtime * const runtime = impl->runtime; + + // we're now unsealed + impl->state = SKC_STYLING_STATE_UNSEALED; +} + +static +void +skc_styling_unseal_cb(cl_event event, cl_int status, struct skc_styling_impl * const impl) +{ + SKC_CL_CB(status); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(impl->runtime->scheduler,skc_styling_unseal_complete,impl); +} + +static +void +skc_styling_pfn_unseal(struct skc_styling_impl * const impl, skc_bool const block) +{ + // return if already unsealed + if (impl->state == SKC_STYLING_STATE_UNSEALED) + return; + + // + // otherwise, we're going to need to pump the scheduler + // + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // + // wait for UNSEALING > UNSEALED transition + // + 
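The seal and unseal paths around here form a small state machine that advances by pumping the scheduler until the desired state is reached. The transition summary below restates what the surrounding code does, and the macro is only a plausible expansion of the wait-while pattern; the real SKC_SCHEDULER_WAIT_WHILE definition belongs to the scheduler and may differ.

/* styling state transitions implied by the surrounding code:
 *
 *   UNSEALED --seal()-->   SEALING   --extents unmapped--> SEALED
 *   SEALED   --unseal()--> UNSEALING --extents mapped-->   UNSEALED
 */

/* hypothetical expansion of the wait-while pumping pattern (assumption only) */
#define DEMO_WAIT_WHILE(scheduler,predicate)    \
  while (predicate)                             \
    {                                           \
      if (!skc_scheduler_yield(scheduler))      \
        skc_scheduler_wait(scheduler);          \
    }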
if (impl->state == SKC_STYLING_STATE_UNSEALING) + { + if (block) { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); + } + return; + } + + // + // otherwise, wait for SEALING > SEALED transition ... + // + if (impl->state == SKC_STYLING_STATE_SEALING) + { + // wait if sealing + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_SEALED); + } + + // wait for rendering locks to be released + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0); + + // ... and then unseal the styling object + impl->state = SKC_STYLING_STATE_UNSEALING; + + // defensively NULL the grid reference + impl->grid = NULL; // defensive + + // set styling pointers with mapped extents + cl_event complete; + + struct skc_styling * const styling = impl->styling; + + styling->layers.extent = skc_extent_phwN_pdrN_map(&impl->layers,impl->cq,NULL); + styling->groups.extent = skc_extent_phwN_pdrN_map(&impl->groups,impl->cq,NULL); + styling->extras.extent = skc_extent_phwN_pdrN_map(&impl->extras,impl->cq,&complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_styling_unseal_cb,impl)); + cl(ReleaseEvent(complete)); + + // flush it + cl(Flush(impl->cq)); + + // wait until unsealed... + if (block) { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); + } +} + +// +// +// + +static +void +skc_styling_pfn_release(struct skc_styling_impl * const impl) +{ + if (--impl->styling->ref_count != 0) + return; + + // + // otherwise, unmap all resources by sealing and delete + // + skc_styling_pfn_seal(impl); + + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // wait until sealed + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_SEALED); + + // wait for locks to drain + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0) + + // + // styling is now disposable + // + + // free styling host + skc_runtime_host_perm_free(runtime,impl->styling); + + // release the cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // free extents + skc_extent_phwN_pdrN_free(runtime,&impl->layers); + skc_extent_phwN_pdrN_free(runtime,&impl->groups); + skc_extent_phwN_pdrN_free(runtime,&impl->extras); + + // free styling impl + skc_runtime_host_perm_free(runtime,impl); +} + +// +// +// + +void +skc_styling_retain_and_lock(struct skc_styling * const styling) +{ + skc_styling_retain(styling); + + styling->impl->lock_count += 1; +} + +void +skc_styling_unlock_and_release(struct skc_styling * const styling) +{ + styling->impl->lock_count -= 1; + + skc_styling_pfn_release(styling->impl); +} + +// +// +// + +skc_err +skc_styling_cl_12_create(struct skc_context * const context, + struct skc_styling * * const styling, + skc_uint const layers_count, + skc_uint const groups_count, + skc_uint const extras_count) +{ + // retain the context + // skc_context_retain(context); + + // allocate the impl + struct skc_runtime * const runtime = context->runtime; + struct skc_styling_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + // allocate styling + (*styling) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**styling)); + (*styling)->context = context; + (*styling)->impl = impl; + + // intialize impl + impl->styling = (*styling); + impl->runtime = runtime; + + SKC_ASSERT_STATE_INIT(impl,SKC_STYLING_STATE_SEALED); + + impl->lock_count = 0; + + impl->cq = skc_runtime_acquire_cq_in_order(runtime); + + // + // The styling object 
is unique in that the API lets the user + // specify resource limits + // + // The styling object is a simple container that can have wildly + // varying resource requirements (but still relatively modest). + // + // Additionally, an advanced SKC programmer may want to create many + // styling and composition objects as they're relatively cheap. + // + skc_extent_phwN_pdrN_alloc(runtime,&impl->layers,sizeof(*(*styling)->layers.extent) * layers_count); + skc_extent_phwN_pdrN_alloc(runtime,&impl->groups,sizeof(*(*styling)->groups.extent) * groups_count); + skc_extent_phwN_pdrN_alloc(runtime,&impl->extras,sizeof(*(*styling)->extras.extent) * extras_count); + + // initialize styling + (*styling)->layers.size = layers_count; + (*styling)->groups.size = groups_count; + (*styling)->extras.size = extras_count; + + (*styling)->layers.count = 0; + (*styling)->groups.count = 0; + (*styling)->extras.count = 0; + + // save pfns + (*styling)->seal = skc_styling_pfn_seal; + (*styling)->unseal = skc_styling_pfn_unseal; + (*styling)->release = skc_styling_pfn_release; + + // set ref count + (*styling)->ref_count = 1; + + // map the extents by unsealing + skc_styling_pfn_unseal(impl,false); + + return SKC_ERR_SUCCESS; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/styling_cl_12.h b/src/compute/skc/platforms/cl_12/styling_cl_12.h new file mode 100644 index 0000000000..a319568ee5 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/styling_cl_12.h @@ -0,0 +1,73 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +#include "styling.h" +#include "grid.h" +#include "extent_cl_12.h" +#include "assert_state.h" + +// +// styling states +// + +typedef enum skc_styling_state_e { + + SKC_STYLING_STATE_UNSEALING, + SKC_STYLING_STATE_UNSEALED, + SKC_STYLING_STATE_SEALING, + SKC_STYLING_STATE_SEALED + +} skc_styling_state_e; + +// +// IMPL +// + +struct skc_styling_impl +{ + struct skc_styling * styling; + struct skc_runtime * runtime; + + SKC_ASSERT_STATE_DECLARE(skc_styling_state_e); + + skc_int lock_count; // # of wip renders + + skc_grid_t grid; + + // in-order command queue + cl_command_queue cq; + + // + // only 3 extents + // + struct skc_extent_phwN_pdrN layers; + struct skc_extent_phwN_pdrN groups; + struct skc_extent_phwN_pdrN extras; +}; + +// +// ONLY VISIBLE WITHIN THIS RUNTIME +// + +void +skc_styling_retain_and_lock(struct skc_styling * const styling); + +void +skc_styling_unlock_and_release(struct skc_styling * const styling); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/surface_cl_12.h b/src/compute/skc/platforms/cl_12/surface_cl_12.h new file mode 100644 index 0000000000..43ea5428a5 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/surface_cl_12.h @@ -0,0 +1,32 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_SURFACE_CL_12_ONCE +#define SKC_SURFACE_CL_12_ONCE + +// +// Unlike other object platform implementations, the surface object +// implementation needs to access the opaque platform-specfic outputs +// of the composition and styling objects. +// +// Composition : { keys, offsets, key_count, offset_count } +// Styling : { layers, groups, commands } +// +// With the OpenCL platform we'll handle this by simply exposing the +// argument value (void*) and its size (size_t). 
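One way to picture the hand-off described above is a (value, size) pair that can be forwarded straight to clSetKernelArg(). The struct below is purely illustrative and is not a type declared by this header.

/* hypothetical shape of the opaque argument hand-off described above */
#include <stddef.h>

struct demo_surface_arg
{
  void const * value; /* e.g. a pointer to a composition's cl_mem handle */
  size_t       size;  /* e.g. sizeof(cl_mem)                             */
};

/* forwarded as: clSetKernelArg(render_kernel,index,arg.size,arg.value) */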
+// +// TODO: It might make sense in the future to support more complex +// rendering jobs that simultaneously involve multiple surfaces, +// compositions and stylings. +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c b/src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c new file mode 100644 index 0000000000..cc7cba5225 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c @@ -0,0 +1,453 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "common/cl/assert_cl.h" + +#include "extent_cl_12.h" +#include "runtime_cl_12.h" +#include "styling_cl_12.h" +#include "composition_cl_12.h" + +#include "context.h" +#include "surface.h" + +// +// +// + +#include + +// +// BUILD +// + +struct skc_surface_impl +{ + struct skc_surface * surface; + struct skc_runtime * runtime; + + // framebuffer + // struct skc_extent_pdrw fb; + // struct skc_extent_phrN_pdwN fb; + + // for now, a single in-order command queue + cl_command_queue cq; + + struct { + cl_kernel render; + } kernels; +}; + +// +// we might want concurrent access to the same surface as long as +// the clips don't overlap. +// +// this would require acquiring a cq on demand when it is determined +// that the clipped render won't overlap +// +// { tile clip , cq } pair +// +// skc_uint4 clip; +// cl_command_queue cq +// + +struct skc_surface_render +{ + skc_uint clip[4]; + + struct skc_surface_impl * impl; + struct skc_styling * styling; + struct skc_composition * composition; + + skc_surface_render_pfn_notify notify; + void * data; + + cl_mem fb; + + skc_grid_t grid; + + skc_subbuf_id_t id; +}; + +// +// +// + +static +void +skc_surface_pfn_clear(struct skc_surface_impl * const impl, + float const rgba[4], + skc_uint const rect[4], + void * fb) +{ + size_t const origin[3] = { rect[0], rect[1], 0 }; + size_t const region[3] = { rect[2], rect[3], 1 }; + + cl(EnqueueFillImage(impl->cq, + (cl_mem)fb, + rgba, + origin, + region, + 0,NULL,NULL)); +} + +// +// +// + +static +void +skc_surface_pfn_blit(struct skc_surface_impl * const impl, + skc_uint const rect[4], + skc_int const txty[2]) +{ + ; +} + +// +// +// + +#if 0 // #ifndef NDEBUG +#define SKC_SURFACE_DEBUG +#endif + +#ifdef SKC_SURFACE_DEBUG + +#define SKC_SURFACE_WIDTH 4096 +#define SKC_SURFACE_HEIGHT 4096 + +static +void +skc_surface_debug(struct skc_surface_impl * const impl) +{ + // + // MAP + // + cl_uchar4 * const rgba = skc_extent_phrN_pdwN_map(&impl->fb, + impl->cq, + NULL); + cl(Finish(impl->cq)); + + // + // WRITE + // + FILE* file; + + errno_t ferr = fopen_s(&file,"surface.ppm","wb"); + + fprintf(file,"P6\n%u %u\n255\n",SKC_SURFACE_WIDTH,SKC_SURFACE_HEIGHT); + + for (skc_uint ii=0; iifb,rgba,impl->cq,NULL); + + cl(Flush(impl->cq)); +} + +#endif + +// +// +// + +void +skc_surface_render_complete(struct skc_surface_render * const render) +{ +#ifdef SKC_SURFACE_DEBUG + // write fb out + skc_surface_debug(render->impl); +#endif + + // notify + if (render->notify != NULL) { + render->notify(render->impl->surface, + render->styling, + render->composition, + render->data); + } + + // unlock and release the styling and composition + skc_styling_unlock_and_release(render->styling); + skc_composition_unlock_and_release(render->composition); + + // grid is now complete + skc_grid_complete(render->grid); +} + +static +void +skc_surface_render_cb(cl_event event, cl_int status, struct 
skc_surface_render * const render) +{ + SKC_CL_CB(status); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(render->impl->runtime->scheduler, + skc_surface_render_complete, + render); +} + +// +// +// + +static +void +skc_surface_grid_pfn_execute(skc_grid_t const grid) +{ + struct skc_surface_render * const render = skc_grid_get_data(grid); + struct skc_surface_impl * const impl = render->impl; + struct skc_runtime * const runtime = impl->runtime; + + // get the composition args + struct skc_composition_impl * const ci = render->composition->impl; + struct skc_place_atomics * const atomics = ci->atomics.hr; + + if (atomics->offsets > 0) + { + // acquire the rbo + cl(EnqueueAcquireGLObjects(impl->cq,1,&render->fb,0,NULL,NULL)); + + // get the styling args + struct skc_styling_impl * const si = render->styling->impl; + + cl(SetKernelArg(impl->kernels.render,0,SKC_CL_ARG(si->layers.drN))); + cl(SetKernelArg(impl->kernels.render,1,SKC_CL_ARG(si->groups.drN))); + cl(SetKernelArg(impl->kernels.render,2,SKC_CL_ARG(si->extras.drN))); + + cl(SetKernelArg(impl->kernels.render,3,SKC_CL_ARG(ci->keys.drw))); + cl(SetKernelArg(impl->kernels.render,4,SKC_CL_ARG(atomics->keys))); + cl(SetKernelArg(impl->kernels.render,5,SKC_CL_ARG(ci->offsets.drw))); + cl(SetKernelArg(impl->kernels.render,6,SKC_CL_ARG(atomics->offsets))); + + // block pool + cl(SetKernelArg(impl->kernels.render,7,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + + // surface + cl(SetKernelArg(impl->kernels.render,8,SKC_CL_ARG(render->fb))); + +#if 1 + // tile clip + cl(SetKernelArg(impl->kernels.render,9,sizeof(skc_uint4),render->clip)); +#else + // surface pitch (height) + skc_uint const surface_pitch = SKC_SURFACE_HEIGHT; + cl(SetKernelArg(impl->kernels.render,9,SKC_CL_ARG(surface_pitch))); + // tile clip + cl(SetKernelArg(impl->kernels.render,10,sizeof(skc_uint4),render->clip)); +#endif + + // launch render kernel + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_RENDER, + impl->cq, + impl->kernels.render, + atomics->offsets, + 0,NULL,NULL); + + + cl_event complete; + + // give the rbo back + cl(EnqueueReleaseGLObjects(impl->cq,1,&render->fb,0,NULL,&complete)); + + // notify anyone listening... 
+ cl(SetEventCallback(complete,CL_COMPLETE,skc_surface_render_cb,render)); + cl(ReleaseEvent(complete)); + + // flush it + cl(Flush(impl->cq)); + } + else + { + skc_surface_render_complete(render); + } +} + +// +// +// + +static +void +skc_surface_pfn_release(struct skc_surface_impl * const impl) +{ + if (--impl->surface->ref_count != 0) + return; + + // + // otherwise, release all resources + // + + // drain the command queue + cl(Finish(impl->cq)); + + struct skc_runtime * const runtime = impl->runtime; + + // release the kernel + cl(ReleaseKernel(impl->kernels.render)); + + // free surface host + skc_runtime_host_perm_free(runtime,impl->surface); + + // release the cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // release fb + // skc_extent_phrN_pdwN_free(runtime,&impl->fb); + + // free surface impl + skc_runtime_host_perm_free(runtime,impl); +} + +// +// +// + +static +void +skc_surface_grid_pfn_dispose(skc_grid_t const grid) +{ + struct skc_surface_render * const render = skc_grid_get_data(grid); + struct skc_surface_impl * const impl = render->impl; + struct skc_runtime * const runtime = impl->runtime; + + // free the render object + skc_runtime_host_temp_free(runtime,render,render->id); + + // release the surface + skc_surface_pfn_release(impl); +} + +// +// +// + +static +void +skc_surface_pfn_render(struct skc_surface_impl * const impl, + uint32_t const clip[4], + skc_styling_t styling, + skc_composition_t composition, + skc_surface_render_pfn_notify notify, + void * data, + void * fb) +{ + // retain surface + skc_surface_retain(impl->surface); + + // + // FIXME -- we used to seal the styling and composition objects if + // they weren't already. Either test that they're sealed or seal + // them here. + // + + // retain and lock the styling and composition + skc_styling_retain_and_lock(styling); + skc_composition_retain_and_lock(composition); + + // + // allocate a render instance + // + skc_subbuf_id_t id; + struct skc_surface_render * const render = skc_runtime_host_temp_alloc(impl->runtime, + SKC_MEM_FLAGS_READ_WRITE, + sizeof(*render),&id,NULL); + render->id = id; + + render->clip[0] = clip[0]; + render->clip[1] = clip[1]; + render->clip[2] = clip[2]; + render->clip[3] = clip[3]; + + render->impl = impl; + render->styling = styling; + render->composition = composition; + + render->notify = notify; + render->data = data; + + render->fb = fb; + + render->grid = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + NULL, // invalidation not necessary + render, + NULL, // no waiting + skc_surface_grid_pfn_execute, + skc_surface_grid_pfn_dispose); + + // declare happens-after relationships + skc_grid_happens_after_grid(render->grid,styling->impl->grid); + skc_grid_happens_after_grid(render->grid,composition->impl->grids.sort); + + // wait for styling and composition + skc_grid_start(render->grid); +} + +// +// +// + +skc_err +skc_surface_cl_12_create(struct skc_context * const context, + struct skc_surface * * const surface) +{ + struct skc_runtime * const runtime = context->runtime; + + // allocate surface + (*surface) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**surface)); + + // allocate impl + struct skc_surface_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + // initialize surface + // SKC_ASSERT_STATE_INIT((*impl),SKC_SURFACE_STATE_READY); + + (*surface)->context = context; + (*surface)->impl = impl; + (*surface)->ref_count = 1; + + (*surface)->release = skc_surface_pfn_release; + 
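skc_surface_pfn_render() above accepts a notify pfn that skc_surface_render_complete() invokes with the surface, styling, composition, and user data once the render grid (which waits on the styling grid and the composition's sort grid) has finished. The callback below is a sketch whose parameter spelling is inferred from that call site; the canonical skc_surface_render_pfn_notify typedef is declared elsewhere.

/* example notify callback matching the skc_surface_render_complete() call
   site above -- parameter types inferred, so treat this as a sketch       */
static void
demo_render_notify(struct skc_surface     * const surface,
                   struct skc_styling     * const styling,
                   struct skc_composition * const composition,
                   void                   * const data)
{
  /* e.g. signal frame completion to the application through 'data' */
  (void)surface; (void)styling; (void)composition; (void)data;
}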
(*surface)->clear = skc_surface_pfn_clear; + (*surface)->blit = skc_surface_pfn_blit; + (*surface)->render = skc_surface_pfn_render; + + // intialize impl + impl->surface = *surface; + impl->runtime = runtime; + +#if 0 + // FIXME -- 4K x 4K -- temporarily fixed size + size_t const fb_size = sizeof(skc_uchar4) * SKC_SURFACE_WIDTH * SKC_SURFACE_HEIGHT; + + // create framebuffer + skc_extent_phrN_pdwN_alloc(runtime,&impl->fb,fb_size); +#endif + + // acquire a command queue + impl->cq = skc_runtime_acquire_cq_in_order(runtime); + + // acquire kernel + impl->kernels.render = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RENDER); + + return SKC_ERR_SUCCESS; +} + +// +// +// diff --git a/src/compute/skc/prefix.cl b/src/compute/skc/prefix.cl deleted file mode 100644 index 960b6cf5ff..0000000000 --- a/src/compute/skc/prefix.cl +++ /dev/null @@ -1,1042 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block.h" -#include "raster.h" -#include "atomic_cl.h" -#include "macros.h" -#include "tile.h" - -// -// INPUT: -// -// TTRK (64-BIT COMPARE) -// -// 0 63 -// | TTSB ID | X | Y | COHORT ID | -// +---------+------+------+-----------+ -// | 27 | 12 | 12 | 13 | -// -// -// TTRK (32-BIT COMPARE) -// -// 0 63 -// | TTSB ID | N/A | X | Y | COHORT ID | -// +---------+-----+------+------+-----------+ -// | 27 | 5 | 12 | 12 | 8 | -// -// -// OUTPUT: -// -// TTSK v2: -// -// 0 63 -// | TTSB ID | PREFIX | N/A | X | Y | -// +---------+--------+------+----+----+ -// | 27 | 1 (=0) | 12 | 12 | 12 | -// -// -// TTPK v1: -// -// 0 63 -// | TTPB ID | ALL ZEROES | SPAN | X | Y | -// +---------+------------+------+-----+-----+ -// | 27 | 1 | 12 | 12 | 12 | -// -// -// TTPK v2: -// -// 0 63 -// | TTPB ID | PREFIX | SPAN | X | Y | -// +---------+--------+------+-----+-----+ -// | 27 | 1 (=1) | 12 | 12 | 12 | -// - -#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1) - -// -// smem accumulator -// - -union skc_subgroup_accum -{ - struct { - SKC_ATOMIC_INT ttp[SKC_TILE_HEIGHT]; - } atomic; - - struct { - skc_ttp_t ttp[SKC_TILE_HEIGHT]; - } aN; - - struct { - SKC_PREFIX_TTP_V ttp[SKC_PREFIX_SUBGROUP_SIZE]; - } vN; - - struct { - SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH]; - } zero; -}; - -// -// -// - -struct skc_subgroup_smem -{ - // prefix accumulator - union skc_subgroup_accum accum; -}; - -// -// -// - -static -skc_uint -skc_subgroup_lane() -{ -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - return get_sub_group_local_id(); -#else - return 0; -#endif -} - -// -// -// - -static -SKC_PREFIX_TTS_V_BITFIELD -skc_tts_get_dy(skc_tts_v_t const ttsv) -{ - // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32] - SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY; - - return dy - (~ttsv >> 31); -} - -static -SKC_PREFIX_TTS_V_BITFIELD -skc_tts_get_py(skc_tts_v_t const ttsv) -{ - return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2); -} - -// -// -// - -static -void -skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v) -{ - // get "altitude" - SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v); - - // get the y pixel coordinate - SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v); - - // - // FIXME -- benchmark performance of setting dy to 0 if tts_v is 
invalid? - // - // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op - // - -#if 0 - if (tts_v != SKC_TTS_INVALID) - printf("< %08X = %u : %d >\n",tts_v,py,dy); -#endif - - // - // scatter-add the "altitude" to accumulator - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C != SKC_TTS_INVALID) { \ - SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \ - } - -#else - // - // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS - // - // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C == SKC_TTS_INVALID) \ - return; \ - smem->accum.aN.ttp[py C] = dy C; -#endif - - SKC_PREFIX_TTS_VECTOR_INT_EXPAND(); -} - -// -// The implication here is that if our device configuration has a -// rectangular 1:2 tile then we need a block size of at least 2 -// subblocks. The subblock size of course needs to match the length of -// the smallest tile side. -// - -static -void -skc_accum_flush(__local struct skc_subgroup_smem * const smem, - __global skc_bp_elem_t * const bp_elems, - skc_block_id_t const pb_id) -{ - // load the ttp elements - SKC_PREFIX_TTP_V const ttp_v = smem->accum.vN.ttp[get_sub_group_local_id()]; - skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); - -#if ( SKC_TILE_RATIO == 1 ) - - bp_elems[offset] = ttp_v; - -#elif ( SKC_TILE_RATIO == 2 ) - - vstore2(ttp_v,offset,bp_elems); - -#else - -#error("tile ratio greater than 2 not supported") - -#endif -} - -// -// -// - -static -void -skc_accum_reset(__local struct skc_subgroup_smem * const smem) -{ - for (uint ii=0; iiaccum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 ); -} - -// -// get next sk key -// - -static -skc_ttsk_s_t -skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v, - skc_uint * const sk_next, - skc_int * const rkpk_rem) -{ - // decrement count - *rkpk_rem -= 1; - -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT with subgroup support is easy - // - // SIMT without subgroup support can always emulate with smem - // -#if 0 - // - // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly - // broadcast a uint2 cast to a long. It was probably bad to do this - // anyway without a union wrapping the TTSK scalar type. - // - // Consider creating a union { ulong; uint2 } at a later date -- - // probably no need to ever do this unless it makes broadcast faster - // which is unlikely since it will probably be implemented as 2 - // 32-bit broadcasts. - // - // Additionally, the TTRK and TTXK key bitfield sizes are probably - // cast in stone and we aren't going to change them no matter - // architecture we're on. 
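The dy handling in skc_tts_get_dy() earlier in this file is easy to misread, so here is a standalone host-side check of the same arithmetic. The 26-bit field offset and the zeroed low bits are assumptions for illustration; real TTS words carry the remaining coordinate fields in the low bits, which the arithmetic shift discards and which never affect the sign test. The sketch also assumes an arithmetic right shift of negative values, which OpenCL guarantees and mainstream C compilers provide.

/* standalone check of the skc_tts_get_dy() unpacking: the arithmetic right
   shift recovers the packed value in [-32,31], and subtracting
   (~ttsv >> 31) adds one exactly when dy is non-negative, mapping the
   packed range onto [-32..-1,+1..+32] with no zero slot                    */
#include <assert.h>
#include <stdint.h>

#define DEMO_TTS_OFFSET_DY 26 /* assumed field position: top 6 bits */

static int32_t
demo_tts_pack_dy(int32_t const dy_packed)
{
  return (int32_t)((uint32_t)dy_packed << DEMO_TTS_OFFSET_DY);
}

static int32_t
demo_tts_get_dy(int32_t const ttsv)
{
  int32_t const dy = ttsv >> DEMO_TTS_OFFSET_DY; /* sign-extending shift */

  return dy - (~ttsv >> 31);                     /* +1 iff dy >= 0       */
}

static void
demo_tts_check(void)
{
  assert(demo_tts_get_dy(demo_tts_pack_dy(  0)) ==   1);
  assert(demo_tts_get_dy(demo_tts_pack_dy( 31)) ==  32);
  assert(demo_tts_get_dy(demo_tts_pack_dy( -1)) ==  -1);
  assert(demo_tts_get_dy(demo_tts_pack_dy(-32)) == -32);
}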
- // - skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++); -#else - skc_ttsk_s_t sk_s; - - sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next); - sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next); - *sk_next += 1; -#endif - -#else - // - // SIMD will always grab component .s0 and then rotate the vector - // - sk_s = ( sk_v->s0 ); - - skc_ttsk_v_rotate_down(sk_v); - -#endif - - return sk_s; -} - -// -// -// - -static -skc_raster_yx_s -skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next) -{ -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT with subgroup support is easy - // - // SIMT without subgroup support can always emulate with smem - // - skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next); - -#else - // - // SIMD will always grab component .s0 and then rotate the vector - // - skc_raster_yx_s const yx_s = ( sk_v->s0.hi ); - -#endif - - return yx_s; -} - -// -// mask off ttsb id -// - -static -skc_block_id_s_t -skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s) -{ - return ( sk_s->lo & SKC_TTXK_LO_MASK_ID ); -} - -// -// load tts_v as early as possible -// - -static -skc_tts_v_t -skc_load_tts(__global skc_bp_elem_t * const bp_elems, - skc_block_id_s_t const sb_id) -{ - return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] ); -} - -// -// massage ttrk keys into ttsk keys -// - -static -void -skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v) -{ - sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID; // clear high (N/A) bits - sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits -} - -// -// replenish ttsk keys -// - -static -void -skc_ttsk_v_replenish(skc_ttsk_v_t * const sk_v, - skc_uint * const sk_next, - skc_uint * const rks_next, - __global skc_ttrk_e_t const * const rks) -{ - // if there are still keys available then return - if (*sk_next < SKC_PREFIX_TTXK_V_SIZE) - return; - - // - // otherwise, replenish sk_v - // - // NOTE NOTE NOTE -- we are assuming rks[] extent size is always - // divisible by TTXK_V_SIZE and therefore loading some keys from the - // next raster is OK. - // - *sk_next = 0; - *rks_next += SKC_PREFIX_SUBGROUP_SIZE; - *sk_v = rks[*rks_next]; - -#if 0 - printf("* %08X ( %3u, %3u )\n", - sk_v->hi, - (sk_v->hi >> 12) & 0xFFF, - (sk_v->hi ) & 0xFFF); -#endif - - skc_ttrk_to_ttsk(sk_v); - -#if 0 - printf("! %08X ( %3u, %3u )\n", - sk_v->hi, - (sk_v->hi >> 20) & 0xFFF, - (sk_v->hi >> 8) & 0xFFF); -#endif -} - -// -// replenish block ids -// -// note that you can't overrun the block id pool since it's a ring -// - -static -void -skc_blocks_replenish(skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) - -{ - *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE; - *blocks = bp_ids[*blocks_idx & bp_mask]; - *blocks_next = 0; - -#if 0 - printf("replenish blocks: %u\n",*blocks); -#endif -} - -// -// -// - -static -skc_block_id_t -skc_blocks_get_next(skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // replenish? 
- if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE) - { - skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - } - -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); - -#else - // - // SIMD - // - skc_block_id_t id = blocks->s0; - - skc_shuffle_down_1(*blocks); - -#endif - - *blocks_next += 1; - - return id; -} - -// -// subblock allocator -// - -#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) - -static -skc_block_id_t -skc_subblocks_get_next_pb_id(skc_block_id_t * const subblocks, - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - } - - skc_block_id_t const pb_id = *subblocks; - - *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks - - return pb_id; -} - -#endif - -// -// append a ttsk key to the work-in-progress node -// - -static -void -skc_node_v_append_sk(skc_ttsk_s_t const * const sk_s, - - skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - skc_uint * const xk_v_idx, - __global skc_bp_elem_t * const bp_elems, - - skc_int const rkpk_rem, - - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, - __global skc_block_id_t const * const bp_ids) -{ - // - // Append an sk key to the in-register xk_v vector - // - // If the work-in-progress node in gmem will only have room for one - // more key then: - // - // - if this was the final SK then write out xk_v and exit - // - // - otherwise, acquire a block id, link it, write out xk_v, - // prepare new node - // - // Note that this does *not* try to squeeze in a final key into the - // next node slot. This optimization isn't worth the added - // down-pipeline complexity. - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) - { - *xk_v = *sk_s; - } - - *xk_v_next += 1; - - // are there more keys coming? - if (rkpk_rem > 0) - { - // is the node almost full? - if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) - { - skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - - if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) - { - xk_v->lo = id; - xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary - } - - // store xk_v (uint2) to bp (uint) - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // update node elem idx - *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // reset node count - *xk_v_next = 0; - } - // is xk_v full? 
- else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) - { - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // increment node elem idx - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - } - } - else - { - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) - { - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - - bp_elems[*xk_v_idx] = SKC_UINT_MAX; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; - } - } - -#else - // - // SIMD - // - -#endif -} - -// -// -// - -static -skc_ttpk_s_t -skc_ttpk_create(skc_raster_yx_s const yx_prev, - skc_raster_yx_s const yx_next, - skc_block_id_t const pb_id) -{ - // - yx_prev is already incremented by one - // - yx_span is already shifted up at hi.x - skc_uint const yx_span = yx_next - yx_prev; - - skc_ttpk_s_t pk; - - // turn on prefix bit | shift span bits upward - pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN); - - // shift down high span bits | yx of tile - pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("* %08v2X : %u\n",pk,yx_span); -#endif - - return pk; -} - -// -// append a ttpk key to the work-in-progress node -// - -static -void -skc_node_v_append_pk(skc_ttpk_s_t const * const pk_s, - - skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - skc_uint * const xk_v_idx, - __global skc_bp_elem_t * const bp_elems, - - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, - __global skc_block_id_t const * const bp_ids) -{ - // - // append a pk key to the in-register xk_v vector - // - // if the work-in-progress node in gmem will only have room for one - // more key then: - // - // - if this was the final SK then write out xk_v and exit - // - // - otherwise, acquire a block id, link it, write out xk_v, - // prepare new node - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) - { - *xk_v = *pk_s; - } - - *xk_v_next += 1; - - // is the node almost full? - if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) - { - skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - - if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) - { - xk_v->lo = id; - xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary - } - - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // update node elem idx - *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // reset node count - *xk_v_next = 0; - } - // is xk_v full? 
- else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) - { - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // increment node elem idx - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - } - -#else - // - // SIMD - // -#endif -} - -// -// append the first 3 fields of meta info to the raster header -// - -static -void -skc_node_v_init_header(skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - union skc_raster_cohort_meta_out const * const meta) -{ -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() < 2) - { - *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi; - } - -#if 0 - if (get_sub_group_local_id() == 0) - printf("header: %08v4X\n",meta->u32v4); -#endif - - // - // increment counter: uint4 + uint4 = uint2 x 4 - // - *xk_v_next = 2 + 2; // +2 for unitialized bounds - -#else - // - // SIMD - // - -#endif -} - -// -// -// - -__kernel -SKC_PREFIX_KERNEL_ATTRIBS -void -skc_kernel_prefix(__global skc_uint const * const bp_atomics, - __global skc_block_id_t const * const bp_ids, - __global skc_bp_elem_t * const bp_elems, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_ttrk_e_t const * const rks, - __global skc_block_id_t * const map, - __global skc_uint const * const metas, - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem smem[1]; -#else - __local struct skc_subgroup_smem smems[SKC_PREFIX_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id(); -#endif - - // - // where is this subgroup in the grid? - // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const sgi = get_group_id(0); -#else - skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - skc_uint const sgl = get_sub_group_local_id(); - - // - // return if this subgroup is excess - // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 ) - if (sgi >= count) - return; -#endif - - // - // get meta info for this subgroup's raster - // - union skc_raster_cohort_meta_out const meta = { vload4(sgi,metas) }; - skc_uint const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("%3u : %5u / %5u / %5u / %5u / %u\n", - sgi, - meta.blocks, - meta.offset, - meta.nodes, - meta.keys, - reads); -#endif - - // - // preload blocks -- align on subgroup - // - skc_uint blocks_idx = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); - skc_block_id_v_t blocks = bp_ids[blocks_idx & bp_mask]; - skc_uint blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK); - - // - // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset - // - skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // - // initialize raster header -- assumes block is greater than 8 words (4 doublewords) - // - skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX }; - skc_uint xk_v_next; - - skc_node_v_init_header(&xk_v,&xk_v_next,&meta); - - // - // no keys -- this is an empty raster! 
- // - if (meta.keys == 0) - { - bp_elems[xk_v_idx ] = xk_v.lo; - bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi; - - while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) - { - xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - - bp_elems[xk_v_idx] = SKC_UINT_MAX; - bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; - } - - return; - } - - // - // load TTRK keys and in-place convert to TTSK keys - // - skc_uint rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); - skc_ttsk_v_t sk_v = rks[rks_next]; - skc_uint sk_next = (meta.offset & SKC_PREFIX_SUBGROUP_MASK); - skc_int rkpk_rem = meta.keys; // signed count of remaining rk+pk keys - -#if 0 - printf("* %08X ( %3u, %3u )\n", - sk_v.hi, - (sk_v.hi >> 12) & 0xFFF, - (sk_v.hi ) & 0xFFF); -#endif - - skc_ttrk_to_ttsk(&sk_v); - -#if 0 - printf("! %08X ( %3u, %3u )\n", - sk_v.hi, - (sk_v.hi >> 20) & 0xFFF, - (sk_v.hi >> 8) & 0xFFF); -#endif - - // - // subblocks - // -#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) - skc_block_id_t subblocks = 0; -#endif - - // - // begin "scan" of tiles - // - skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next); - - // - // zero the accumulator - // - skc_accum_reset(smem); - - while (true) - { - // get next rk key - skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem); - - // load ttsb id - skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s); - - // load tts_v transaction "in flight" as early as possible - skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id); - -#if 0 - printf("{ %08X }\n",tts_v); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("[ %d, %X ]\n",rkpk_rem,sb_id); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF); -#endif - - // - // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF - // TIME AND SIMD'IZED - // - - // if yx's don't match then we're either issuing a ttpk or - // resetting the accumulator - if (sk_s.hi != yx_prev) - { - // if yx_next.y == yx_last.y then x changed - if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0) - { - // - // if the tile is not square then it's ratio is 1:2 - // -#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 - skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks, - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); -#else - skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); -#endif - - // flush accumulated ttp vector to block/subblock at ttpb_id - skc_accum_flush(smem,bp_elems,pb_id); - -#if 0 - if (get_sub_group_local_id() == 0) - { - printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n", - pb_id, - (yx_prev >> SKC_TTXK_HI_OFFSET_Y), - (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF, - (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF, - (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF); - } -#endif - - // - // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP - // - rkpk_rem -= 1; - - // create the pk - skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id); - - // append pk key to xk buffer - skc_node_v_append_pk(&pk_s, - - &xk_v, - &xk_v_next, - &xk_v_idx, - bp_elems, - - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); - } - else if (rkpk_rem > 0) // we're starting a new tile row - { - skc_accum_reset(smem); - } - } - - // - // append sk key to node_v - // - // if rkpk_rem is 
zero then return from kernel - // - skc_node_v_append_sk(&sk_s, - - &xk_v, - &xk_v_next, - &xk_v_idx, - bp_elems, - - rkpk_rem, - - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); - - // we're done if no more sk keys - if (rkpk_rem == 0) - break; - - // move to new tile - yx_prev = sk_s.hi; - - // scatter tts values into accumulator - skc_accum_scatter(smem,tts_v); - - // replenish sk keys - skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks); - } -} - -// -// -// diff --git a/src/compute/skc/raster_builder_cl_12.c b/src/compute/skc/raster_builder_cl_12.c deleted file mode 100644 index 33992cbdfb..0000000000 --- a/src/compute/skc/raster_builder_cl_12.c +++ /dev/null @@ -1,1349 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -// get rid of these -#include -#include - -// -// -// - -#include "hs/cl/hs_cl_launcher.h" - -#include "common/cl/assert_cl.h" - -#include "context.h" -#include "grid.h" -#include "raster.h" -#include "extent_ring.h" -#include "raster_builder.h" - -#include "tile.h" - -#include "config_cl.h" -#include "runtime_cl_12.h" -#include "extent_cl_12.h" -#include "raster_builder_cl_12.h" - -// -// RASTERIZATION SUB-PIPELINE -// -------------------------- -// -// Phase 1: expand commands -// -// Phase 2: rasterize -// -// Phase 3: sort & segment || release paths -// -// Phase 4: prefix -// -// Phase 5: release rasters -// -// RASTER COHORT -// ============== -// -// BUILDER RASTERIZER POST PROCESSING -// <-----------------------------------------------> <------------> <---------------------------------------------------------------------> -// -// fill cmds transforms raster clips path release rasterize cmds cohort map raster release TTSB TTSK cohort atomics context atomics -// --------- ---------- ------------ ------------ -------------- ---------- -------------- ---- ---- -------------- --------------- -// 1,2 1,2 1,2 1,2 2 1-4 1,2,3,4 2-4 2-4 2-4 global -// -// -// NOTES: FINE-GRAINED SVM -// ----------------------- -// -// 1) In a fine-grained system we know the exact number of -// rasterize cmds per segment type before phase 1 -// -// 2) A raster that's "under construction" shouldn't be rasterized -// until it is complete. This implies that a raster is not part -// of a cohort until it is complete. The raster builder must -// handle raster promises being "forced" to completion -- this is -// likely the result of composition construction and subsequent -// rendering to a surface. -// -// 3) The raster cohort rasterizer state retains the fill cmd, -// transform, raster clip and path release "ring" extents. -// -// 4) The rasterize cmd extent sizes (line, quad, cubic, rational -// quad, rational cubic) are known ahead of time. -// -// 5) The raster cohort post processor is standalone and retains the -// raster_map, cohort atomics, TTSK_RYX extent, and raster -// references until complete. -// - -// -// Notes: -// -// - Could have a pipeline stage before expansion count the exact -// number of line/quad/cubic commands but the command buffers are -// relatively small (64-bit commands * # of path segments). 
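//
// - As a rough sanity check on "relatively small" (the one-million
//   figure here is purely illustrative, not a measured workload):
//   each expanded command is a 64-bit skc_cmd_rasterize, so ~1M path
//   segments expand to only ~8 MB of rasterize commands.
//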
-// - -// raster -// cohort atomics path_ids raster_ids transforms clips cmds_fill cmds_l/q/c ttsk_ryx -// -// -// BEGIN ^ -// | -// EXPAND | -// | -// RASTERIZE | -// | -// SORT || RELEASE PATHS | -// | -// PREFIX | -// | -// RELEASE RASTERS | -// | -// END v -// -// -// BEGIN -// -// EXPAND -- PRODUCES: one or more extents of rasterization commands -// -// RASTERIZE -- DEPENDENCY: requires size of command extents before launching -// -- PRODUCES: an extent of ttsk_ryx keys -// -// SORT || RELEASE PATHS -- DEPENDENCY: requires size of key extent before launching -// -- PRODUCES: sorted array of keys -// -// PREFIX -- DEPENDENCY: none -- can execute after SORT because grid size is number of rasters -// -// RELEASE RASTERS -- DEPENDENCY: none -- can execute after prefix -// -// END -// - -// ------------------------ -// -// DEPENDENCY is cleanly implemented with a host callback or device kernel launcher -// -// Can this hide resource acquisition? Yes. But there are two cases: -// -// 1. acqusition of resources occurs on the host thread and lack of -// resources drains the host command queue until resources are -// available (OpenCL 2.x) -// -// 2. the host commands lazily acquire resources (OpenCL 1.2) -// -// ------------------------ -// -// How to express? -// -// Each substage launches its successors. This supports both dependency models. -// -// If OpenCL 1.2 then the substage can't be launched until the prior -// stage's event is complete. So this requires registering a callback -// to invoke the substage. -// -// ------------------------ - -// -// BUILD -// - -struct skc_raster_builder_impl -{ - struct skc_raster_builder * raster_builder; - struct skc_runtime * runtime; - - skc_grid_t cohort; - - // these are all durable/perm extents - struct skc_extent_phrwg_thr1s path_ids; // read/write by host - struct skc_extent_phw1g_tdrNs transforms; // write once by host + read by device - struct skc_extent_phw1g_tdrNs clips; // write once by host + read by device - struct skc_extent_phw1g_tdrNs fill_cmds; // write once by host + read by device - struct skc_extent_phrwg_tdrNs raster_ids; // read/write by host + read by device - - struct { - cl_kernel fills_expand; - cl_kernel rasterize_all; - cl_kernel segment; - cl_kernel rasters_alloc; - cl_kernel prefix; - } kernels; -}; - -// -// RASTER COHORT -// -// This sub-pipeline snapshots the raster builder and then acquires -// and releases host and device resources as necessary (as late as -// possible). -// -// Note that the cohort extents are ephemeral and are only used by one -// or more stages of a the rasterization sub-pipeline. -// -// The pipeline implementation may vary between compute platforms. 
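//
// For concreteness, the OpenCL 1.2 chaining described above looks
// roughly like the sketch below. The substage names skc_stage_1,
// skc_stage_2 and skc_stage_2_cb are placeholders; the helpers
// SKC_CL_CB, SKC_SCHEDULER_SCHEDULE, skc_grid_get_data, cl() and
// skc_extent_thr_tdrw_read are the same ones used by the real
// substages further below.
//

static void skc_stage_2(skc_grid_t const grid); // next substage (placeholder)

static
void
skc_stage_2_cb(cl_event event, cl_int status, skc_grid_t const grid)
{
  SKC_CL_CB(status);

  struct skc_raster_cohort * const cohort = skc_grid_get_data(grid);

  // don't run the substage on the driver thread -- bounce it back to
  // the context command scheduler as quickly as possible
  SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_stage_2,grid);
}

static
void
skc_stage_1(skc_grid_t const grid)
{
  struct skc_raster_cohort * const cohort = skc_grid_get_data(grid);

  // ... enqueue this substage's kernels on cohort->cq ...

  // read back the cohort atomics so the successor knows its grid size
  cl_event complete;

  skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete);

  // registering the callback is what launches the successor substage
  cl(SetEventCallback(complete,CL_COMPLETE,skc_stage_2_cb,grid));
  cl(ReleaseEvent(complete));

  // flush so the enqueued work actually starts
  cl(Flush(cohort->cq));
}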
-// - -struct skc_raster_cohort -{ - struct skc_raster_builder_impl * impl; - - struct skc_extent_phrwg_thr1s_snap path_ids; // read/write by host - struct skc_extent_phw1g_tdrNs_snap transforms; // write once by host + read by device - struct skc_extent_phw1g_tdrNs_snap clips; // write once by host + read by device - struct skc_extent_phw1g_tdrNs_snap fill_cmds; // write once by host + read by device - struct skc_extent_phrwg_tdrNs_snap raster_ids; // read/write by host + read by device - - cl_command_queue cq; - - // sub-pipeline atomics - struct skc_extent_thr_tdrw atomics; - - // path primitives are expanded into line/quad/cubic/rational cmds - struct skc_extent_tdrw cmds; - - // rasterization output - struct skc_extent_tdrw keys; - // struct skc_extent_thrw_tdrw keys; - - // post-sort extent with metadata for each raster - struct skc_extent_tdrw metas; - // struct skc_extent_thrw_tdrw metas; - - // subbuf id - skc_subbuf_id_t id; - - // - // pipeline also uses the following global resources: - // - // - command queue from global factory - // - global block pool and its atomics - // - global path and raster host id map - // - temporary host and device allocations - // -}; - -// -// TTRK (64-BIT COMPARE) -// -// 0 63 -// | TTSB ID | X | Y | COHORT ID | -// +---------+------+------+-----------+ -// | 27 | 12 | 12 | 13 | -// -// -// TTRK (32-BIT COMPARE) -// -// 0 63 -// | TTSB ID | N/A | X | Y | COHORT ID | -// +---------+-----+------+------+-----------+ -// | 27 | 5 | 12 | 12 | 8 | -// - -// -// TTRK is sortable intermediate key format for TTSK -// -// We're going to use the 32-bit comparison version for now -// - -union skc_ttrk -{ - skc_ulong u64; - skc_uint2 u32v2; - - struct { - skc_uint block : SKC_TTXK_LO_BITS_ID; - skc_uint na0 : SKC_TTRK_LO_BITS_NA; - skc_uint x : SKC_TTXK_HI_BITS_X; - skc_uint y : SKC_TTXK_HI_BITS_Y; - skc_uint cohort : SKC_TTRK_HI_BITS_COHORT; - }; - - struct { - skc_uint na1; - skc_uint yx : SKC_TTXK_HI_BITS_YX; - skc_uint na2 : SKC_TTRK_HI_BITS_COHORT; - }; - - struct { - skc_uint na3; - skc_uint na4 : SKC_TTXK_HI_BITS_X; - skc_uint cohort_y : SKC_TTRK_HI_BITS_COHORT_Y; - }; -}; - -// -// -// - -static -void -skc_raster_builder_pfn_release(struct skc_raster_builder_impl * const impl) -{ - // decrement reference count - if (--impl->raster_builder->refcount != 0) - return; - - // - // otherwise, dispose of the the raster builder and its impl - // - struct skc_runtime * const runtime = impl->runtime; - - // free the raster builder - skc_runtime_host_perm_free(runtime,impl->raster_builder); - - // free durable/perm extents - skc_extent_phrwg_thr1s_free(runtime,&impl->path_ids); - skc_extent_phw1g_tdrNs_free(runtime,&impl->transforms); - skc_extent_phw1g_tdrNs_free(runtime,&impl->clips); - skc_extent_phw1g_tdrNs_free(runtime,&impl->fill_cmds); - skc_extent_phrwg_tdrNs_free(runtime,&impl->raster_ids); - - // release kernels - cl(ReleaseKernel(impl->kernels.fills_expand)); - cl(ReleaseKernel(impl->kernels.rasterize_all)); - -#if 0 - cl(ReleaseKernel(impl->kernels.rasterize_lines)); - cl(ReleaseKernel(impl->kernels.rasterize_quads)); - cl(ReleaseKernel(impl->kernels.rasterize_cubics)); -#endif - - cl(ReleaseKernel(impl->kernels.segment)); - cl(ReleaseKernel(impl->kernels.rasters_alloc)); - cl(ReleaseKernel(impl->kernels.prefix)); - - // free the impl - skc_runtime_host_perm_free(runtime,impl); -} - -// -// -// - -static -void -skc_raster_builder_rasters_release(struct skc_runtime * const runtime, - skc_raster_t const * const rasters, - skc_uint const size, - 
skc_uint const from, - skc_uint const to) -{ - if (from <= to) // no wrap - { - skc_raster_t const * rasters_from = rasters + from; - skc_uint count_from = to - from; - - skc_grid_deps_unmap(runtime->deps,rasters_from,count_from); - skc_runtime_raster_device_release(runtime,rasters_from,count_from); - } - else // from > to implies wrap - { - skc_raster_t const * rasters_lo = rasters + from; - skc_uint count_lo = size - from; - - skc_grid_deps_unmap(runtime->deps,rasters_lo,count_lo); - skc_runtime_raster_device_release(runtime,rasters_lo,count_lo); - - skc_grid_deps_unmap(runtime->deps,rasters,to); - skc_runtime_raster_device_release(runtime,rasters,to); - } -} - -static -void -skc_raster_builder_paths_release(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s_snap * const snap) -{ - // release lo - skc_runtime_path_device_release(runtime,snap->hr1.lo,snap->count.lo); - - // release hi - if (snap->count.hi) - skc_runtime_path_device_release(runtime,snap->hr1.hi,snap->count.hi); -} - -static -void -skc_raster_builder_cohort_grid_pfn_dispose(skc_grid_t const grid) -{ - // - // ALLOCATED RESOURCES - // - // path_ids - - // raster_ids a - // transforms - - // clips - - // fill_cmds - - // cq a - // cohort atomics a - // cmds - - // keys a - // meta a - // - - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - struct skc_raster_builder_impl * const impl = cohort->impl; - struct skc_runtime * const runtime = impl->runtime; - - // - // release paths -- FIXME -- Note that releasing paths can be - // performed after rasterization is complete - // - - // snap alloc the paths -- this host snap simply sets up pointers - skc_extent_phrwg_thr1s_snap_alloc(runtime,&impl->path_ids,&cohort->path_ids); - - // unmap and release raster ids - skc_raster_builder_paths_release(runtime,&cohort->path_ids); - - // release path ids - skc_extent_phrwg_thr1s_snap_free(runtime,&cohort->path_ids); - - // - // release rasters - // - skc_uint const size = cohort->raster_ids.snap->ring->size.pow2; - skc_uint const from = skc_extent_ring_snap_from(cohort->raster_ids.snap); - skc_uint const to = skc_extent_ring_snap_to(cohort->raster_ids.snap); - - // unmap and release raster ids - skc_raster_builder_rasters_release(runtime,impl->raster_ids.hrw,size,from,to); - - // release cohort's remaining allocated resources - skc_extent_phrwg_tdrNs_snap_free(runtime,&cohort->raster_ids); - skc_runtime_release_cq_in_order(runtime,cohort->cq); - skc_extent_thr_tdrw_free(runtime,&cohort->atomics); - skc_extent_tdrw_free(runtime,&cohort->keys); - skc_extent_tdrw_free(runtime,&cohort->metas); - // skc_extent_thrw_tdrw_free(runtime,&cohort->keys); - // skc_extent_thrw_tdrw_free(runtime,&cohort->metas); - skc_runtime_host_temp_free(runtime,cohort,cohort->id); - - // release the raster builder - skc_raster_builder_pfn_release(impl); - - // - // ALLOCATED RESOURCES - // - // path_ids - - // raster_ids - - // transforms - - // clips - - // fill_cmds - - // cq - - // cohort atomics - - // cmds - - // keys - - // meta - - // -} - -// -// -// - -static -void -skc_raster_cohort_prefix_release(skc_grid_t const grid) -{ - // FIXME -- note that pfn_dispose can be accomplished here - - // release the grid - skc_grid_complete(grid); -} - -static -void -skc_raster_cohort_prefix_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - struct skc_scheduler * const scheduler = cohort->impl->runtime->scheduler; - - // as quickly 
as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(scheduler,skc_raster_cohort_prefix_release,grid); -} - -// -// -// - -#if 0 -static -int cmp64(const void * ptr_a, const void * ptr_b) -{ - skc_ulong const a = *(const skc_ulong *)ptr_a; - skc_ulong const b = *(const skc_ulong *)ptr_b; - - if (a < b) return -1; - if (a > b) return +1; - else return 0; -} -#endif - -// -// -// - -static -void -skc_raster_cohort_sort_prefix(skc_grid_t const grid) -{ - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms a - // clips a - // fill_cmds - - // cq a - // cohort atomics a - // cmds a - // keys a - // meta - - // - - // use the backpointers - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - struct skc_raster_builder_impl * const impl = cohort->impl; - struct skc_runtime * const runtime = impl->runtime; - - // release transforms - skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->transforms); - - // release clips - skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->clips); - - // release expanded cmds - skc_extent_tdrw_free(runtime,&cohort->cmds); - - // alloc the snapshost -- could be zero-sized - skc_extent_phrwg_tdrNs_snap_alloc(runtime, - &impl->raster_ids, - &cohort->raster_ids, - cohort->cq,NULL); - - // will never be zero - skc_uint const rasters = skc_extent_ring_snap_count(cohort->raster_ids.snap); - - // acquire fixed-size device-side extent - skc_extent_tdrw_alloc(runtime, - &cohort->metas, - sizeof(struct skc_raster_cohort_meta)); - - // skc_extent_thrw_tdrw_alloc(runtime, - // &cohort->metas, - // sizeof(struct skc_raster_cohort_meta)); - - // zero the metas - skc_extent_tdrw_zero(&cohort->metas,cohort->cq,NULL); - - // get the read-only host copy of the device atomics - struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr; - - // - // SORT - // - if (atomics->keys > 0) - { -#ifndef NDEBUG - fprintf(stderr,"raster cohort sort: %u\n",atomics->keys); -#endif - - // - // - // - uint32_t keys_padded_in, keys_padded_out; - - hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); - - hs_sort(cohort->cq, - cohort->keys.drw, - cohort->keys.drw, - atomics->keys, - keys_padded_in, - keys_padded_out, - false); - - cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(cohort->keys.drw))); - cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(cohort->metas.drw))); - -#ifndef NDEBUG - fprintf(stderr,"post-sort\n"); -#endif - - // find start of each tile - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK, - cohort->cq, - impl->kernels.segment, - atomics->keys, - 0,NULL,NULL); - -#ifndef NDEBUG - fprintf(stderr,"post-segment\n"); -#endif - - // - // DELETE ALL THIS WHEN READY - // - -#if 0 - // - // - // - cl(Finish(cohort->cq)); - - // map keys to host - union skc_ttrk * const keys = skc_extent_thrw_tdrw_map(&cohort->keys, - cohort->cq, - NULL); - // map meta to host - struct skc_raster_cohort_meta * const metas = skc_extent_thrw_tdrw_map(&cohort->metas, - cohort->cq, - NULL); - // block until done - cl(Finish(cohort->cq)); - - // sort keys - qsort(keys,atomics->keys,sizeof(*keys),cmp64); - - // mask to determine if rk id is a new block - skc_uint const subblock_mask = runtime->config->block.subblocks - 1; - - // - // some counters - // - union skc_raster_cohort_meta_in meta_in = { - .blocks = 0, - .offset = 0, - .pk = 0, - .rk = 0 - }; - - // get first key - union skc_ttrk curr = keys[0]; - - skc_uint ii=0, jj=0; - - // for all TTRK keys - while (true) - { - 
// increment ttrk count - meta_in.rk += 1; - - // was this a new block? - if ((curr.u32v2.lo & subblock_mask) == 0) - meta_in.blocks += 1; - - // break if we're out of keys - if (++ii >= atomics->keys) - break; - - // otherwise, process next key - union skc_ttrk const next = keys[ii]; - - // if new cohort then save curr meta and init next meta - if (next.cohort != curr.cohort) - { - fprintf(stderr,"[ %u, %u, %u, %u ]\n", - meta_in.blocks, - meta_in.offset, - meta_in.pk, - meta_in.rk); - - // store back to buffer - metas->inout[curr.cohort].in = meta_in; - - // update meta_in - meta_in.blocks = 0; - meta_in.offset = ii; - meta_in.pk = 0; - meta_in.rk = 0; - } - // otherwise, if same y but new x then increment TTPK count - else if ((next.y == curr.y) && (next.x != curr.x)) - { - meta_in.pk += 1; - -#if 0 - fprintf(stderr,"%3u : %3u : ( %3u, %3u ) -> ( %3u )\n", - jj++,curr.cohort,curr.y,curr.x,next.x); -#endif - } - -#if 0 - fprintf(stderr,"( %3u, %3u )\n",next.y,next.x); -#endif - - curr = next; - } - - fprintf(stderr,"[ %u, %u, %u, %u ]\n", - meta_in.blocks, - meta_in.offset, - meta_in.pk, - meta_in.rk); - - // store back to buffer - metas->inout[curr.cohort].in = meta_in; - - - // unmap - skc_extent_thrw_tdrw_unmap(&cohort->keys, - keys, - cohort->cq, - NULL); - - // unmap - skc_extent_thrw_tdrw_unmap(&cohort->metas, - metas, - cohort->cq, - NULL); -#endif - } - -#ifndef NDEBUG - fprintf(stderr,"rasters_alloc: %u\n",rasters); -#endif - - // - // RASTER ALLOC/INIT - // - cl(SetKernelArg(impl->kernels.rasters_alloc,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); - cl(SetKernelArg(impl->kernels.rasters_alloc,1,SKC_CL_ARG(runtime->block_pool.ids.drw))); - cl(SetKernelArg(impl->kernels.rasters_alloc,2,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); - cl(SetKernelArg(impl->kernels.rasters_alloc,3,SKC_CL_ARG(runtime->handle_pool.map.drw))); - cl(SetKernelArg(impl->kernels.rasters_alloc,4,SKC_CL_ARG(cohort->metas.drw))); - cl(SetKernelArg(impl->kernels.rasters_alloc,5,SKC_CL_ARG(cohort->raster_ids.drN))); - cl(SetKernelArg(impl->kernels.rasters_alloc,6,SKC_CL_ARG(rasters))); - - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC, - cohort->cq, - impl->kernels.rasters_alloc, - rasters, - 0,NULL,NULL); - -#ifndef NDEBUG - fprintf(stderr,"post-alloc\n"); -#endif - - // - // PREFIX - // - cl(SetKernelArg(impl->kernels.prefix,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); - cl(SetKernelArg(impl->kernels.prefix,1,SKC_CL_ARG(runtime->block_pool.ids.drw))); - cl(SetKernelArg(impl->kernels.prefix,2,SKC_CL_ARG(runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.prefix,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); - - cl(SetKernelArg(impl->kernels.prefix,4,SKC_CL_ARG(cohort->keys.drw))); - cl(SetKernelArg(impl->kernels.prefix,5,SKC_CL_ARG(runtime->handle_pool.map.drw))); - - cl(SetKernelArg(impl->kernels.prefix,6,SKC_CL_ARG(cohort->metas.drw))); - cl(SetKernelArg(impl->kernels.prefix,7,SKC_CL_ARG(rasters))); - - cl_event complete; - - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_PREFIX, - cohort->cq, - impl->kernels.prefix, - rasters, - 0,NULL, - &complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_prefix_cb,grid)); - cl(ReleaseEvent(complete)); - -#ifndef NDEBUG - fprintf(stderr,"post-prefix\n"); -#endif - - // flush command queue - cl(Flush(cohort->cq)); - - // - // ALLOCATED RESOURCES - // - // path_ids a - // raster_ids a - // transforms - - // clips - - // fill_cmds - - // cq a - // cohort atomics a - // 
cmds - - // keys a - // meta a - // -} - -static -void -skc_raster_cohort_rasterize_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_sort_prefix,grid); -} - -static -void -skc_raster_cohort_rasterize(skc_grid_t const grid) -{ - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms i - // clips i - // fill_cmds s - // cq a - // cohort atomics a - // cmds a - // cmds_quad a - // cmds_cubic a - // keys - - // meta - - - // use the backpointers - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - struct skc_raster_builder_impl * const impl = cohort->impl; - struct skc_runtime * const runtime = impl->runtime; - - // - // RELEASED RESOURCES - // - // cmds snap - // - - // release the cmds extent and snap since it's only used by the expand stage - skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->fill_cmds); - - // - // NEW ALLOCATED RESOURCES - // - // transforms snap - // clips snap - // ttrk keys - // - skc_extent_phw1g_tdrNs_snap_alloc(runtime, - &impl->transforms, - &cohort->transforms, - cohort->cq,NULL); - - skc_extent_phw1g_tdrNs_snap_alloc(runtime, - &impl->clips, - &cohort->clips, - cohort->cq,NULL); - - // acquire device-side extent - skc_extent_tdrw_alloc(runtime, - &cohort->keys, - sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys); - - // skc_extent_thrw_tdrw_alloc(runtime, - // &cohort->keys, - // sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys); - - // - // acquire out-of-order command queue - // - // and launch up to 3 kernels - // - // for each kernel: - // - // set runtime "global" kernel args: - // - // - block pool atomics - // - block pool extent - // - // set cohort "local" kernel args: - // - // - atomics - // - cmds - // - // enqueue barrier - // enqueue copy back of atomics on the command queue - // set callback on copy back event - // release command queue - // - struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr; - - if (atomics->cmds > 0) - { - cl(SetKernelArg(impl->kernels.rasterize_all,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,1,SKC_CL_ARG(runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,2,SKC_CL_ARG(runtime->block_pool.ids.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); - - cl(SetKernelArg(impl->kernels.rasterize_all,4,SKC_CL_ARG(cohort->atomics.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,5,SKC_CL_ARG(cohort->keys.drw))); - - cl(SetKernelArg(impl->kernels.rasterize_all,6,SKC_CL_ARG(cohort->transforms.drN))); - cl(SetKernelArg(impl->kernels.rasterize_all,7,SKC_CL_ARG(cohort->clips.drN))); - cl(SetKernelArg(impl->kernels.rasterize_all,8,SKC_CL_ARG(cohort->cmds.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,9,SKC_CL_ARG(atomics->cmds))); - - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL, - cohort->cq, - impl->kernels.rasterize_all, - atomics->cmds, - 0,NULL,NULL); - } - - // - // copyback number of TTSK keys - // - cl_event complete; - - skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_rasterize_cb,grid)); - 
cl(ReleaseEvent(complete)); - - // flush command queue - cl(Flush(cohort->cq)); - - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms a - // clips a - // fill_cmds - - // cq a - // cohort atomics a - // cmds a - // keys a - // meta - -} - -static -void -skc_raster_cohort_fills_expand_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_rasterize,grid); -} - -static -void -skc_raster_builder_cohort_grid_pfn_execute(skc_grid_t const grid) -{ - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms i - // clips i - // fill_cmds i - // cq - - // cohort atomics - - // cmds - - // keys - - // meta - - // - - // allocate the cohort - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - - // get impl - struct skc_raster_builder_impl * const impl = cohort->impl; - struct skc_runtime * const runtime = impl->runtime; - - // acquire in-order cq - cohort->cq = skc_runtime_acquire_cq_in_order(runtime); - - // alloc the snapshot -- could be zero-sized - skc_extent_phw1g_tdrNs_snap_alloc(runtime, - &impl->fill_cmds, - &cohort->fill_cmds, - cohort->cq,NULL); - - // flush the cq to get the fill running - // cl(Flush(cohort->cq)); - - // create split atomics - skc_extent_thr_tdrw_alloc(runtime,&cohort->atomics,sizeof(struct skc_raster_cohort_atomic)); - - // zero the atomics - skc_extent_thr_tdrw_zero(&cohort->atomics,cohort->cq,NULL); - - // get config - struct skc_config const * const config = runtime->config; - - // acquire device-side extents - skc_extent_tdrw_alloc(runtime, - &cohort->cmds, - sizeof(union skc_cmd_rasterize) * config->raster_cohort.expand.cmds); - - // - // FILLS EXPAND - // - // need result of cmd counts before launching RASTERIZE grids - // - // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host - // - OpenCL 2.x: have a kernel size and launch RASTERIZE grids from device - // - or launch a device-wide grid that feeds itself but that's unsatisfying - // - - // how many commands? 
could be zero - skc_uint const work_size = skc_extent_ring_snap_count(cohort->fill_cmds.snap); - - if (work_size > 0) - { - cl(SetKernelArg(impl->kernels.fills_expand,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.fills_expand,1,SKC_CL_ARG(cohort->atomics.drw))); - cl(SetKernelArg(impl->kernels.fills_expand,2,SKC_CL_ARG(runtime->handle_pool.map.drw))); - cl(SetKernelArg(impl->kernels.fills_expand,3,SKC_CL_ARG(cohort->fill_cmds.drN))); - cl(SetKernelArg(impl->kernels.fills_expand,4,SKC_CL_ARG(cohort->cmds.drw))); - - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_FILLS_EXPAND, - cohort->cq, - impl->kernels.fills_expand, - work_size, - 0,NULL,NULL); - } - - // - // copyback number of rasterization commands - // - cl_event complete; - - skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_fills_expand_cb,grid)); - cl(ReleaseEvent(complete)); - - // flush command queue - cl(Flush(cohort->cq)); - - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms i - // clips i - // fill_cmds s - // cq a - // cohort atomics a - // cmds a - // keys - - // meta - - // -} - -// -// move grid into waiting state -// -// this entails allocating a cohort from the temporary extent -// - -static -void -skc_raster_builder_cohort_grid_pfn_waiting(skc_grid_t const grid) -{ - // get the impl - struct skc_raster_builder_impl * const impl = skc_grid_get_data(grid); - struct skc_runtime * const runtime = impl->runtime; - - // retain the raster builder - impl->raster_builder->refcount += 1; - - // allocate the ephemeral/temp cohort - skc_subbuf_id_t id; - - struct skc_raster_cohort * const cohort = - skc_runtime_host_temp_alloc(runtime, - SKC_MEM_FLAGS_READ_WRITE, - sizeof(*cohort), - &id, - NULL); - - // save the id and backpointer - cohort->id = id; - cohort->impl = impl; - - // set grid data -- replaces impl - skc_grid_set_data(grid,cohort); - - // - // ACQUIRE RESOURCES FOR THE COHORT - // - - struct skc_raster_builder * const raster_builder = impl->raster_builder; - - // immediately take snapshots of all rings -- these are very inexpensive operations - skc_extent_phrwg_thr1s_snap_init(runtime,&raster_builder->path_ids .ring,&cohort->path_ids); - skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->transforms.ring,&cohort->transforms); - skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->clips .ring,&cohort->clips); - skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->fill_cmds .ring,&cohort->fill_cmds); - skc_extent_phrwg_tdrNs_snap_init(runtime,&raster_builder->raster_ids.ring,&cohort->raster_ids); - - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms i - // clips i - // fill_cmds i - // cq - - // cohort atomics - - // cmds - - // keys - - // meta - - // -} - -// -// -// - -static -void -skc_raster_builder_cohort_create(struct skc_raster_builder_impl * const impl) -{ - // attach a grid - impl->cohort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - &impl->cohort, - impl, - skc_raster_builder_cohort_grid_pfn_waiting, - skc_raster_builder_cohort_grid_pfn_execute, - skc_raster_builder_cohort_grid_pfn_dispose); -} - -// -// -// - -static -skc_err -skc_raster_builder_pfn_add(struct skc_raster_builder_impl * const impl, - skc_path_t const * paths, - skc_uint count) -{ - // validate and retain the path - skc_err err; - - err = skc_runtime_handle_device_validate_retain(impl->runtime, - SKC_TYPED_HANDLE_TYPE_IS_PATH, - 
paths, - count); - - if (err) - return err; - - skc_runtime_handle_device_retain(impl->runtime,paths,count); - - // make sure there is a grid - if (impl->cohort == NULL) { - skc_raster_builder_cohort_create(impl); - } - - // declare rasterization grid happens after path - while (count-- > 0) - skc_grid_happens_after_handle(impl->cohort,SKC_TYPED_HANDLE_TO_HANDLE(*paths++)); - - return SKC_ERR_SUCCESS; -} - -// -// -// - -static -void -skc_raster_builder_pfn_end(struct skc_raster_builder_impl * const impl, skc_raster_t * const raster) -{ - // - // acquire host-managed path raster handle and bump reference count - // to 2 handles will be released (reduced to 1) once the rasters are - // completely rasterized - // - *raster = skc_runtime_handle_device_acquire(impl->runtime); - - // make sure there is a grid - if (impl->cohort == NULL) { - skc_raster_builder_cohort_create(impl); - } - - // map a handle to a grid - skc_grid_map(impl->cohort,*raster); -} - -// -// snapshot the ring and lazily start the grid -// -// FIXME -- might want to revisit this and settle on an even more -// opaque implementation. Some options: -// -// - never let the SKC API expose a forced grid start -// - make snapshots kick off a forced grid start -// - be lazy all the time everywhere -// - -static -void -skc_raster_builder_pfn_start(struct skc_raster_builder_impl * const impl) -{ - skc_grid_t const cohort = impl->cohort; - - if (cohort != NULL) { - skc_grid_start(cohort); - } -} - -// -// NOTE: THIS MIGHT BE REMOVED -// - -static -void -skc_raster_builder_pfn_force(struct skc_raster_builder_impl * const impl) -{ - skc_grid_t const cohort = impl->cohort; - - if (cohort != NULL) { - skc_grid_force(cohort); - } -} - -// -// -// - -skc_err -skc_raster_builder_cl_12_create(struct skc_context * const context, - struct skc_raster_builder * * const raster_builder) -{ - struct skc_runtime * const runtime = context->runtime; - - // allocate raster builder - (*raster_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**raster_builder)); - - // refcount - (*raster_builder)->refcount = 1; - - // state - SKC_ASSERT_STATE_INIT((*raster_builder),SKC_RASTER_BUILDER_STATE_READY); - - // allocate runtime raster builder - struct skc_raster_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - // save the impl - (*raster_builder)->impl = impl; - - // intialize impl - impl->raster_builder = (*raster_builder); - impl->runtime = runtime; - impl->cohort = NULL; - - // get config - struct skc_config const * const config = runtime->config; - - skc_extent_phrwg_thr1s_alloc(runtime,&impl->path_ids ,sizeof(skc_path_t ) * config->raster_cohort.path_ids .elem_count); - skc_extent_phw1g_tdrNs_alloc(runtime,&impl->transforms,sizeof(union skc_transform) * config->raster_cohort.transforms.elem_count); - skc_extent_phw1g_tdrNs_alloc(runtime,&impl->clips ,sizeof(union skc_path_clip) * config->raster_cohort.clips .elem_count); - skc_extent_phw1g_tdrNs_alloc(runtime,&impl->fill_cmds ,sizeof(union skc_cmd_fill ) * config->raster_cohort.fill .elem_count); - skc_extent_phrwg_tdrNs_alloc(runtime,&impl->raster_ids,sizeof(skc_raster_t ) * config->raster_cohort.raster_ids.elem_count); - - // retain the context - //skc_context_retain(context); - - (*raster_builder)->context = context; - - (*raster_builder)->add = skc_raster_builder_pfn_add; - (*raster_builder)->end = skc_raster_builder_pfn_end; - (*raster_builder)->start = skc_raster_builder_pfn_start; - (*raster_builder)->force = 
skc_raster_builder_pfn_force; - (*raster_builder)->release = skc_raster_builder_pfn_release; - - // initialize raster builder with host-writable buffers - (*raster_builder)->path_ids .extent = impl->path_ids.hrw; - (*raster_builder)->transforms.extent = impl->transforms.hw1; - (*raster_builder)->clips .extent = impl->clips.hw1; - (*raster_builder)->fill_cmds .extent = impl->fill_cmds.hw1; - (*raster_builder)->raster_ids.extent = impl->raster_ids.hrw; - - // - // the rings perform bookkeeping on the extents - // - // the ring snapshotting and checkpointing are necessary because - // another part of the API can _force_ the raster cohort to flush - // its work-in-progress commands but only up to a checkpointed - // boundary - // - skc_extent_ring_init(&(*raster_builder)->path_ids.ring, - config->raster_cohort.path_ids.elem_count, - config->raster_cohort.path_ids.snap_count, - sizeof(skc_path_t)); - - skc_extent_ring_init(&(*raster_builder)->transforms.ring, - config->raster_cohort.transforms.elem_count, - config->raster_cohort.transforms.snap_count, - sizeof(union skc_transform)); - - skc_extent_ring_init(&(*raster_builder)->clips.ring, - config->raster_cohort.clips.elem_count, - config->raster_cohort.clips.snap_count, - sizeof(union skc_path_clip)); - - skc_extent_ring_init(&(*raster_builder)->fill_cmds.ring, - config->raster_cohort.fill.elem_count, - config->raster_cohort.fill.snap_count, - sizeof(union skc_cmd_fill)); - - skc_extent_ring_init(&(*raster_builder)->raster_ids.ring, - config->raster_cohort.raster_ids.elem_count, - config->raster_cohort.raster_ids.snap_count, - sizeof(skc_raster_t)); - - // - // acquire kernels - // - impl->kernels.fills_expand = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_FILLS_EXPAND); - impl->kernels.rasterize_all = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL); - -#if 0 - impl->kernels.rasterize_lines = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES); - impl->kernels.rasterize_quads = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS); - impl->kernels.rasterize_cubics = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS); -#endif - - impl->kernels.segment = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK); - impl->kernels.rasters_alloc = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC); - impl->kernels.prefix = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_PREFIX); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/raster_builder_cl_12.h b/src/compute/skc/raster_builder_cl_12.h deleted file mode 100644 index f6e1751ef1..0000000000 --- a/src/compute/skc/raster_builder_cl_12.h +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#ifndef SKC_RASTER_BUILDER_CL_12_ONCE -#define SKC_RASTER_BUILDER_CL_12_ONCE - -// -// -// - -#include "types.h" -#include "macros.h" -#include "common.h" - -// -// FIXME -- these magic numbers will be replaced with tile.h constants -// although they're probably universal across all devices -// -// FIXME -- NEED TO EVALUATE IF THIS DISTRIBUTION OF BITS IS GOING TO -// BE TOO SMALL -- plenty of room to jiggle these bits -// - -#define SKC_CMD_RASTERIZE_BITS_TRANSFORM 12 -#define SKC_CMD_RASTERIZE_BITS_CLIP 12 -#define SKC_CMD_RASTERIZE_BITS_COHORT 8 - -SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_TRANSFORM == SKC_CMD_FILL_BITS_TRANSFORM); -SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_CLIP == SKC_CMD_FILL_BITS_CLIP); -SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_COHORT == SKC_CMD_FILL_BITS_COHORT); - -// -// device-side rasterization cmd -// - -union skc_cmd_rasterize -{ - skc_ulong u64; - - skc_uint2 u32v2; - - struct { - // - // Unlike anywhere else in the pipeline, the nodeword index points - // "inside" of a path node (with word resolution). This means - // there is up to 16 GB of 32-bit word addressing in a unified - // block pool: - // - // "16GB ought to be enough for anyone" -- ASM 5/30/17 - // - skc_uint nodeword; -#if defined(__OPENCL_C_VERSION__) - skc_uint tcc; -#else - skc_uint transform : SKC_CMD_RASTERIZE_BITS_TRANSFORM; - skc_uint clip : SKC_CMD_RASTERIZE_BITS_CLIP; - skc_uint cohort : SKC_CMD_RASTERIZE_BITS_COHORT; -#endif - }; -}; - -SKC_STATIC_ASSERT(sizeof(union skc_cmd_rasterize) == sizeof(skc_uint2)); - -// -// -// - -#define SKC_CMD_RASTERIZE_HI_OFFSET_COHORT (SKC_CMD_RASTERIZE_BITS_TRANSFORM + SKC_CMD_RASTERIZE_BITS_CLIP) -#define SKC_CMD_RASTERIZE_MASK_COHORT(c) ((c).u32v2.hi & SKC_BITS_TO_MASK_AT(SKC_CMD_RASTERIZE_BITS_COHORT,SKC_CMD_RASTERIZE_HI_OFFSET_COHORT)) - -#define SKC_CMD_RASTERIZE_GET_TRANSFORM(c) ((c).u32v2.hi & SKC_BITS_TO_MASK(SKC_CMD_RASTERIZE_BITS_TRANSFORM)) -#define SKC_CMD_RASTERIZE_GET_CLIP(c) SKC_BFE((c).tcc,SKC_CMD_RASTERIZE_BITS_CLIP,SKC_CMD_RASTERIZE_BITS_TRANSFORM) -#define SKC_CMD_RASTERIZE_GET_COHORT(c) ((c).u32v2.hi >> SKC_CMD_RASTERIZE_HI_OFFSET_COHORT) -// SKC_BFE((c).tcc,SKC_CMD_RASTERIZE_BITS_COHORT,SKC_CMD_RASTERIZE_HI_OFFSET_COHORT) - -// -// -// - -#define SKC_TTSK_SIZE_COHORT (1 << SKC_CMD_RASTERIZE_BITS_COHORT) - -// -// COHORT META DATA -// - -union skc_raster_cohort_meta_in -{ - skc_uint4 u32v4; - - struct { - skc_uint blocks; // # of rk blocks - skc_uint offset; // start of rk span - skc_uint pk; // # of pk keys - skc_uint rk; // # of rk keys - }; -}; - -union skc_raster_cohort_meta_out -{ - skc_uint4 u32v4; - - struct { - skc_uint blocks; // # of blocks in raster -- initially just rk blocks - skc_uint offset; // start of rk span - skc_uint nodes; // # of nodes in raster -- necessary for walking - skc_uint keys; // # of rk & pk keys -- initially just rk - }; -}; - -union skc_raster_cohort_meta_inout -{ - union skc_raster_cohort_meta_in in; - union skc_raster_cohort_meta_out out; -}; - -// -// followed by one word for the offset -// - -struct skc_raster_cohort_meta -{ - union skc_raster_cohort_meta_inout inout[SKC_TTSK_SIZE_COHORT]; - skc_uint reads[SKC_TTSK_SIZE_COHORT]; // starting ring reads -- [0] is raster head -}; - -#define SKC_RASTER_COHORT_META_OFFSET_READS (SKC_OFFSET_OF(struct skc_raster_cohort_meta,reads) / sizeof(skc_uint)) - -// -// COHORT ATOMICS -// - -struct skc_raster_cohort_atomic -{ - // rasterization input - skc_uint cmds; - - // rasterization output - skc_uint keys; - - // block pool base -- idea here is to 
perform one atomic allocation - // skc_uint bp_base; -}; - -#define SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS 0 -#define SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS 1 - -#define SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS_CALC (SKC_OFFSET_OF(struct skc_raster_cohort_atomic,cmds) / sizeof(skc_uint)) -#define SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS_CALC (SKC_OFFSET_OF(struct skc_raster_cohort_atomic,keys) / sizeof(skc_uint)) - -SKC_STATIC_ASSERT(SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS == SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS_CALC); // verify -SKC_STATIC_ASSERT(SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS == SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS_CALC); // verify - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/rasterize.cl b/src/compute/skc/rasterize.cl deleted file mode 100644 index c9462ecff5..0000000000 --- a/src/compute/skc/rasterize.cl +++ /dev/null @@ -1,3367 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block_pool_cl.h" - -#include "atomic_cl.h" -#include "common.h" -#include "tile.h" - -// #define SKC_ARCH_AVX2 -// #define SKC_RASTERIZE_SIMD_USES_SMEM - -#define PRINTF_ENABLE 0 -#define PRINTF_BLOCK_COUNT 0 - -// -// NOTE: -// -// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT -// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE -// -// NOTE: -// -// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP. -// -// - -#if 0 // SKC_ARCH_AVX2 - -// #define SKC_RASTERIZE_SUBGROUP_SIZE 1 -// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3 -// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1 - -// #define SKC_TTXB_WORDS 8 - -// #define SKC_RASTERIZE_FLOAT float8 -// #define SKC_RASTERIZE_UINT uint8 -// #define SKC_RASTERIZE_INT int8 -// #define SKC_RASTERIZE_PREDICATE int8 - -// #define SKC_RASTERIZE_BIN_BLOCK uint16 -// #define SKC_RASTERIZE_BIN uint8 - -// #define SKC_RASTERIZE_POOL uint8 -// #define SKC_RASTERIZE_POOL_SCALE 6 - -// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1 -// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 - -// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8() - -#endif - -// -// SIMT -// - -#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE -#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE -#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1) - -// -// -// - -#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2) -#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE) - -// -// -// - -#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 } -#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 } - -// -// -// - -#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS) -#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS) -#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS) -#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS) -#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT) -#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS) - -// -// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" -// -// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ -// -// Lerp in two fma/mad ops: -// -// t * b + ((-t) * a + 
a) -// -// Note: OpenCL documents mix() as being implemented as: -// -// a + (b - a) * t -// -// But this may be a native instruction on some devices. For example, -// on GEN9 there is an LRP "linear interoplation" opcode but it -// doesn't appear to support half floats. -// -// Feel free to toggle this option and then benchmark and inspect the -// generated code. We really want the double FMA to be generated when -// there isn't support for a LERP/MIX operation. -// - -#if 1 -#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) -#else -#define SKC_LERP(a,b,t) mix(a,b,t) -#endif - -// -// There is no integer MAD in OpenCL with "don't care" overflow -// semantics. -// -// FIXME -- verify if the platform needs explicit MAD operations even -// if a "--fastmath" option is available at compile time. It might -// make sense to explicitly use MAD calls if the platform requires it. -// - -#if 1 -#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c)) -#else -#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c) -#endif - -// -// -// - -#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()) - -// -// -// - -union skc_bp_elem -{ - skc_uint u32; - skc_tagged_block_id_t tag_id; - skc_float coord; -}; - -// -// -// - -struct skc_subgroup_smem -{ - // - // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM ) - struct { - union { - - skc_uint winner; - - struct { - skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; - } aN; - - struct { - SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; - } vN; - }; - } subgroup; -#endif - - // - // work-in-progress TTSB blocks and associated YX keys - // - union { - struct { - // FIXME -- some typedefs are valid here - skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; - skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - } aN; -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - struct { - SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - SKC_RASTERIZE_BIN yx; - SKC_RASTERIZE_BIN id; - SKC_RASTERIZE_BIN count; - } vN; -#endif - } bin; -}; - -// -// -// - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) -#define skc_subgroup_lane() 0 -#else -#define skc_subgroup_lane() get_sub_group_local_id() -#endif - -// -// replenish block ids -// -// note that you can't overrun the block id pool since it's a ring -// - -static -void -skc_blocks_replenish(skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // - // get a new vector of block ids -- this is kind of a narrow - // allocation but subblocks help stretch out the pool. 
- // - // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids - // - skc_uint bp_idx = 0; - - if (skc_subgroup_lane() == 0) - { - bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS, - SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads -#if 0 - printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE); -#endif - } - - bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask; - *blocks = bp_ids[bp_idx]; - *blocks_next = 0; -} - -// -// -// - -static -skc_block_id_t -skc_blocks_get_next(skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // replenish? - if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE) - { - skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); - } - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); - -#else - // - // SIMD - // - skc_block_id_t id = blocks->s0; - - skc_shuffle_down_1(*blocks); - -#endif - - *blocks_next += 1; - - return id; -} - -// -// subblock allocator -// - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - -static -skc_block_id_t -skc_subblocks_get_next(skc_block_id_t * const subblocks, - skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); - } - - skc_block_id_t const sb_id = *subblocks; - - *subblocks += 1; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("= %u\n",sb_id); -#endif - - return sb_id; -} - - -#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks -#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks - -#else - -#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks -#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks - -#endif - -// -// -// - -static -skc_block_id_t -skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(), - skc_uint * const blocks_next, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_ttsk_v_t * const sk_v, - skc_uint * const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - skc_uint const new_yx) -{ -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t const new_id = skc_subblocks_get_next(subblocks, - blocks_next, - blocks, - bp_atomics, - bp_mask, - bp_ids); -#else - skc_block_id_t const new_id = skc_blocks_get_next(blocks_next, - blocks, - bp_atomics, - bp_mask, // pow2 modulo mask for block pool ring - bp_ids); -#endif - - if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK)) - { - sk_v->lo = new_id; - sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx; -#if 0 - printf("@ ( %3u, %3u ) %u\n", - (new_yx >> 12) & 0xFFF, - (new_yx ) & 0xFFF, - new_id); -#endif - } - - *sk_v_next += 1; - - if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE) - { - *sk_v_next = 0; - - skc_uint sk_idx = 0; - - if (skc_subgroup_lane() == 0) - { - sk_idx = 
SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE); -#if 0 - printf("+ %u\n",sk_idx); -#endif - } - - sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE ) - if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE) -#endif - { - sk_extent[sk_idx] = *sk_v; -#if 0 - printf("> %u : %v2u\n",sk_idx,*sk_v); -#endif - } - } - - return new_id; -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. - // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 0 + - // -- - // 01 - SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 012 + - // ---- - // 0123 - // 01 + - // ---- - // 0123 - // - SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 0123456 + - // -------- - // 01234567 - // 012345 + - // -------- - // 01234567 - // 0123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w); - SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 0123456789abcde + - // ---------------- - // 0123456789abcdef - // 0123456789abcd + - // ---------------- - // 0123456789abcdef - // 0123456789ab + - // ---------------- - // 0123456789abcdef - // 01234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); - SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); - SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_add(v); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. 
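//
// The swizzle/mad ladders in these scans are a log-step
// (Hillis-Steele) inclusive scan unrolled for a fixed vector width.
// A scalar sketch of the same idea for a width of 8 (illustrative
// only; the name scan_inclusive_add_8 is not part of this file):
//
static
void
scan_inclusive_add_8(float v[8])
{
  // offsets 1,2,4 -- each pass adds the element `offset` lanes to the
  // left, which is exactly what the shifted swizzle with a 0/1 mask does
  for (int offset=1; offset<8; offset<<=1)
    {
      float t[8];

      for (int ii=0; ii<8; ii++)
        t[ii] = (ii >= offset) ? v[ii] + v[ii-offset] : v[ii];

      for (int ii=0; ii<8; ii++)
        v[ii] = t[ii];
    }
}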
- // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 0 + - // -- - // 01 - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 012 + - // ---- - // 0123 - // 01 + - // ---- - // 0123 - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 0123456 + - // -------- - // 01234567 - // 012345 + - // -------- - // 01234567 - // 0123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w); - SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 0123456789abcde + - // ---------------- - // 0123456789abcdef - // 0123456789abcd + - // ---------------- - // 0123456789abcdef - // 0123456789ab + - // ---------------- - // 0123456789abcdef - // 01234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); - SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); - SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_add(v); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. 
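//
// The max scan below exists to support skc_scatter_scan_max(): each
// lane with work scatters its own index at its exclusive-scan slot
// and the max scan then propagates that source lane to the right.  A
// scalar sketch, assuming at most 64 lanes (the scatter_scan_max_64
// name and fixed bound are illustrative, not part of this file):
//
static
void
scatter_scan_max_64(unsigned int const n,        // number of lanes, <= 64
                    float        const iss[],    // inclusive scan of per-lane work counts
                    float        const ess[],    // exclusive scan of per-lane work counts
                    unsigned int       source[]) // out: source lane per worker slot
{
  unsigned int scratch[64] = { 0 };

  // scatter: lane ii claims the worker slot at its exclusive prefix
  for (unsigned int ii=0; ii<n; ii++)
    if ((iss[ii] > 0.0f) && (ess[ii] < (float)n))
      scratch[(unsigned int)(ess[ii] < 0.0f ? 0.0f : ess[ii])] = ii;

  // inclusive max scan: propagate each claimed lane index rightward
  unsigned int running = 0;

  for (unsigned int ii=0; ii<n; ii++)
    {
      running    = (scratch[ii] > running) ? scratch[ii] : running;
      source[ii] = running;
    }
}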
- // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 00 max - // -- - // 01 - SKC_RASTERIZE_UINT const w = max(v.s00,v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 0012 + - // ---- - // 0123 - // 0101 + - // ---- - // 0123 - // - SKC_RASTERIZE_UINT const w = max(v.s0012,v); - SKC_RASTERIZE_UINT const x = max(w.s0101,w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 00123456 + - // -------- - // 01234567 - // 01012345 + - // -------- - // 01234567 - // 01230123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_UINT const w = max(v.s00123456,v); - SKC_RASTERIZE_UINT const x = max(w.s01012345,w); - SKC_RASTERIZE_UINT const y = max(x.s01230123,x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 00123456789abcde + - // ---------------- - // 0123456789abcdef - // 010123456789abcd + - // ---------------- - // 0123456789abcdef - // 01230123456789ab + - // ---------------- - // 0123456789abcdef - // 0123456701234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v); - SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w); - SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x); - SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_max(v); - -#endif -} - -// -// -// - -static -float -skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return v.s1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return v.s3; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return v.s7; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return v.sf; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return v.s1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return v.s3; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return v.s7; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return v.sf; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); - -#endif -} - -// -// -// - -static -float -skc_subgroup_first(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#else - return v.s0; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,0); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v, - SKC_RASTERIZE_UINT const i) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#else - return shuffle(v,i); -#endif - -#else - // - // SIMT - // - return intel_sub_group_shuffle(v,i); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // 
previous - SKC_RASTERIZE_FLOAT const c) // current -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // FIXME -- there are alternative formulations here: - // - // Option 1: - // - // select(c.rotate(+1),p.rotate(-1),(1,0,0,...)) - // - // Option 2: - // - // p is a scalar - // t = c.rotate(+1) - // t.s0 = p; - // - // Option 3: ... - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return p; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return shuffle2(p,c,(uint2)(1,2)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return shuffle2(p,c,(uint4)(3,4,5,6)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)); -#endif - -#else - // - // SIMT - // - return intel_sub_group_shuffle_up(p,c,1); - -#endif -} - -// -// -// - -static -bool -skc_is_lane_first() -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) - // - // SIMD - // - return true; -#else - // - // SIMT - // - return get_sub_group_local_id() == 0; -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_delta_offset() -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return 1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ); -#endif - -#else - // - // SIMT - // - return 1.0f + get_sub_group_local_id(); - -#endif - -} - -// -// -// - -static -int -skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - return any(p); -#else - // - // SIMT - // - return sub_group_any(p); -#endif -} - -// -// -// - -#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK) - -void -skc_segment_next(__global union skc_bp_elem * const bp_elems, - skc_uint * const nodeword, - skc_block_id_t * const id) -{ - if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword)) - { - *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS; - } - - skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id; - - *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - } -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y) -{ - return native_sqrt(x * x + y * y); -} - -// -// Wang's Formula (1985) -// - -#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned - -#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32) - -#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON)) -#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON)) - -#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y) -#define SKC_WANG_SQRT(x) native_sqrt(x) - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, - SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, - SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y, - SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y) -{ - // - // Return 
the number of evenly spaced (in the parametric sense) line - // segments that are guaranteed to be within "epsilon" error of the - // curve. - // - // We're then going to take multiples of the reciprocal of this - // number so that the segmentation can be distributed across the - // subgroup. - // - // Note, this can probably be slightly optimized per architecture - // but it's probably far from being a hotspot since it's all - // straight-line unpredicated code. - // - // The result is an integer ranging from [1.0,#segments] - // - // Note that even if all of the control points are coincident, the - // max(1.0f) will categorize this as a line of 1 segment. - // - // This is what we want! We want to convert cubics to lines as - // easily as possible and *then* cull lines that are either - // horizontal or zero length. - // - return max(1.0f, - ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC * - SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x), - fabs(t3x - 2.0f * t2x + t1x)), - max(fabs(t2y - 2.0f * t1y + t0y), - fabs(t3y - 2.0f * t2y + t1y)))))); -} - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, - SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, - SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y) -{ - return max(1.0f, - ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * - SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x), - fabs(t2y - 2.0f * t1y + t0y))))); -} - -// -// rational curves -// - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_cubic_rat() -{ - return 0.0f; -} - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_quad_rat() -{ - return 0.0f; -} - -// -// flush any work-in-progress blocks and return unused block ids -// - -static -void -skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_block_id_v_t * const blocks, - skc_uint const blocks_next, - skc_ttsk_v_t * const sk_v, - skc_uint const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - __local struct skc_subgroup_smem volatile * const smem) -{ - // - // flush non-empty bins - // - // FIXME -- accelerate this iteration/search with a subgroup operation - // - for (skc_uint ii=0; iibin.aN.count[ii] > 0) - { - skc_block_id_v_t const id = smem->bin.aN.id[ii]; - skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - skc_uint const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()]; -#if 0 - printf("???????? 
: [ %10u = %10u : %08X ]\n",id,idx,tts); -#endif - bp_elems[idx].u32 = tts; - } - - // - // FIXME -- vectorize with vstoreN() - // - } - - // - // return remaining block ids back to the pool - // - skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next; - - if (blocks_rem > 0) - { - skc_uint bp_idx = 0; - - if (skc_subgroup_lane() == 0) - { - bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem); - -#if 0 - printf("r-: %8u + %u\n",bp_idx,blocks_rem); -#endif - } - - bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask; - - if (skc_subgroup_lane() >= blocks_next) - { - bp_ids[bp_idx] = *blocks; - } - } - - // - // flush work-in-progress ryx keys - // - if (sk_v_next > 0) - { - skc_uint sk_idx = 0; - - if (skc_subgroup_lane() == 0) - { - sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next); -#if 0 - printf("* %u\n",sk_idx); -#endif - } - - sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); - - if (skc_subgroup_lane() < sk_v_next) - { - sk_extent[sk_idx] = *sk_v; - } - } -} - -// -// If there are lanes that were unable to append to a bin because -// their hashes collided with a bin's current ryx key then those bins -// must be ejected. -// -// Note that we do not eject "full" bins because lazily waiting for a -// collision results in simpler code. -// - -static -void -skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_block_id_t * const subblocks, - skc_block_id_v_t * const blocks, - skc_uint * const blocks_next, - skc_ttsk_v_t * const sk_v, - skc_uint * const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - __local struct skc_subgroup_smem volatile * const smem, - SKC_RASTERIZE_UINT const hash, - SKC_RASTERIZE_UINT const yx, - SKC_RASTERIZE_PREDICATE is_collision) // pass by value -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - - // - // FIXME -- this code is now stale with the changes to the - // subblock/block allocation strategy - // - - // - // get local TTSB ID queue count - // - skc_uint ttsb_id_count = smem->pool.count; // scalar - - // init hash bit mask - skc_uint component_mask = 0; - - for (int cc=0; ccbin.aN.count[winner] > 0) - { - skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - - bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; - } - - // - // ensure there is at least one TTSK and TTSB ID - // - if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE) - { - // - // update remaining count - // - ttsb_id_count = 0; - - // - // flush accumulated ttsk_ryx keys - // - uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count - -#if 0 - printf("# %u\n",idx); -#endif - - for (uint ii=0; iipool.aN.id[ii] = bp_ids[id + ii]; - } - - // - // invalidate the winning block - // - - // - // update bin with winning yx, new ttsb id and zero count - // - // all lanes are loading/storing from/to the same index - // - smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID ); - smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count]; - smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc]; - smem->bin.aN.count[winner] = 0; - - // - // update count - // 
- ttsb_id_count += 1; - } - - // - // save count - // - smem->pool.count = ttsb_id_count; - -#else - // - // SIMT - // - - do { - // - // only one lane will win! - // - if (is_collision) - smem->subgroup.winner = hash; - - barrier(CLK_LOCAL_MEM_FENCE); - - // - // which bin is being ejected? - // - skc_uint const winner = smem->subgroup.winner; - - // - // which colliding hash is taking over the bin? - // - SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner); - - // - // all lanes with the same hash will try to store but only one - // lane will win - // - if (is_winner) - smem->subgroup.winner = yx; - - barrier(CLK_LOCAL_MEM_FENCE); - - // - // flush this block to the pool - // - if (smem->bin.aN.count[winner] > 0) - { - skc_block_id_v_t const id = smem->bin.aN.id[winner]; - skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; -#if 0 - printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts); -#endif - bp_elems[idx].u32 = tts; - } - - // - // append new ttsk - // - skc_uint const new_yx = smem->subgroup.winner; - skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(), - blocks_next, - bp_atomics, - bp_mask, // pow2 modulo mask for block pool ring - bp_ids, - cohort_atomics, - sk_v, - sk_v_next, - sk_extent, - new_yx); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf(">>> %9u\n",new_id); - } -#endif - - // - // update bin with winning yx, new ttsb id and zero count - // - smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID; - smem->bin.aN.yx [winner] = new_yx; - smem->bin.aN.id [winner] = new_id; - smem->bin.aN.count[winner] = 0; - - // - // remove all lanes matching this hash - // - is_collision = is_collision && !is_winner; - - // - // exit if nothing left to do - // - } while (sub_group_any(is_collision)); - -#endif -} - -// -// scatter scan max -// -static -SKC_RASTERIZE_UINT -skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem, - SKC_RASTERIZE_FLOAT const iss, - SKC_RASTERIZE_FLOAT const ess) -{ - // - // prefix sums determine which lanes we're going to work on next - // - SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP); - SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f)); - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#ifdef SKC_RASTERIZE_SIMD_USES_SMEM - // - // SIMD APPROACH 1: SIMT'ISH - // - - // zero the volatile smem scratchpad using vector syntax - smem->subgroup.vN.scratch[0] = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (is_scratch_store C) \ - smem->subgroup.aN.scratch[scratch_idx C] = I; - - SKC_RASTERIZE_VECTOR_EXPAND(); - - // propagate lanes to right using max scan - SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0]; - SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); - -#else - // - // SIMD APPROACH 2: SCALAR'ISH - // - - SKC_RASTERIZE_UINT source = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (is_scratch_store C) \ - ((uint *)&source)[scratch_idx C] = I; - - SKC_RASTERIZE_VECTOR_EXPAND(); - - for (uint ii=1; iisubgroup.vN.scratch[skc_subgroup_lane()] = ( 0 ); - - // - // store source lane at starting lane - // - if (is_scratch_store) - smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane(); - - // - // propagate lanes to right using max scan - // - SKC_RASTERIZE_UINT const 
scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()]; - SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); -#endif - - return source; -} - -// -// sliver lines into subpixels -// - -static -void -skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_block_id_t * const subblocks, - skc_block_id_v_t * const blocks, - skc_uint * const blocks_next, - skc_ttsk_v_t * const sk_v, - skc_uint * const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - __local struct skc_subgroup_smem volatile * const smem, - SKC_RASTERIZE_FLOAT const l0x, - SKC_RASTERIZE_FLOAT const l0y, - SKC_RASTERIZE_FLOAT const l1x, - SKC_RASTERIZE_FLOAT const l1y) -{ - // - // Y-SLIVERING - // ----------- - // - // immediately sliver all multi-pixel lines in into 1-pixel high - // lines - // - // note this implicitly squelches horizontal lines - // - // there is another test for horizontal lines after x-slivering - // is complete - // - - // - // will we need to flip the sign of y_delta ? - // - SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y); - SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000; - - // - // save 1/dy - // - SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y); - - // - // how many non-horizontal subpixel y-axis slivers are there? - // - SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); - SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); - SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max; - SKC_RASTERIZE_FLOAT y_segs = y_max - y_min; - - // - // inclusive subgroup scan of y_segs - // - SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs); - SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs; - float y_rem = skc_subgroup_last_float(y_iss); - - // - // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails - // - if (y_segs == 0.0f) - y_iss = 0.0f; - -#if 0 - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem); -#endif - - // - // these values don't matter on first iteration - // - SKC_RASTERIZE_FLOAT n1x_prev = 0; - SKC_RASTERIZE_FLOAT n1y_prev = 0; - - // - // loop until done - // - while (y_rem > 0.0f) - { - // - // distribute work across lanes - // - SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess); - - // - // get line at y_source line - // - SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source); - SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source); - SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source); - SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source); - - // - // every lane will create a 1 pixel tall line "sliver" - // - // FIXME -- this gets expanded on SIMD - // - // if numerator == 1 then this is the first lane - // if numerator == s then this is the last lane - // - SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source); - SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source); - - SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f); - SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count); - - // toggle y_delta sign - SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source))); 
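//
// The sign toggle above relies on IEEE-754 keeping the float sign in
// the MSB: XOR with 0x80000000 negates, XOR with 0 is a no-op, so the
// branch collapses into one bitwise op.  A host-side sketch
// (toggle_sign is an illustrative name, not part of this file):
//

#include <stdint.h>
#include <string.h>

static
float
toggle_sign(float const f, uint32_t const sign_xor) // sign_xor is 0 or 0x80000000
{
  uint32_t u;
  float    r;

  memcpy(&u,&f,sizeof(u)); // as_uint(f)

  u ^= sign_xor;

  memcpy(&r,&u,sizeof(r)); // as_float(u)

  return r;                // toggle_sign(2.0f,0x80000000u) == -2.0f
}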
- - // - // calculate "right" line segment endpoint - // - SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP; - SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source); - SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t)); - - // - // override c1 if this is last point - // - n1y = select(n1y,m1y,is_y_last); - n1x = select(n1x,m1x,is_y_last); - - // - // shuffle up "left" line segment endpoint - // - // NOTE: Intel's shuffle_up is unique with its elegant - // "previous" argument so don't get used to it - // - SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y); - SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x); - - // - // override shuffle up if this is the first line segment - // - n0y = select(n0y,m0y,is_y_first); - n0x = select(n0x,m0x,is_y_first); - - // - // save previous right endpoint - // - n1x_prev = n1x; - n1y_prev = n1y; - - // - // decrement by subgroup size - // - y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - -#if 0 - // - // debug - // - if (n0y != n1y) { - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y); - } -#endif - - // - // X-SLIVERING - // ----------- - // - // now sliver 1-pixel high lines into at either vertical or - // 1-pixel wide lines - // - // save original direction and work with increasing x - // - SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x); - SKC_RASTERIZE_UINT const dx_xor = x_lt ? 0 : 0x80000000; - - // - // save 1/dy - // - SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x); - - // - // how many non-horizontal subpixel y-axis slivers are there? - // - SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); - SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); - SKC_RASTERIZE_FLOAT const x_base = x_lt ? 
x_min : x_max; - SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f); - - // - // inclusive subgroup scan of y_segs - // - SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs); - SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs; - float x_rem = skc_subgroup_last_float(x_iss); - - // - // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails - // - //if (x_segs == 0.0f) - // x_iss = 0.0f; - - // - // these values don't matter on first iteration - // - SKC_RASTERIZE_FLOAT p1x_prev = 0; - SKC_RASTERIZE_FLOAT p1y_prev = 0; - - // - // loop until done - // - while (x_rem > 0) - { - // - // distribute work across lanes - // - SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess); - - // - // get line at y_source line - // - SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source); - SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source); - SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source); - SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source); - - // - // every lane will create a 1 pixel tall line "sliver" - // - // FIXME -- this gets expanded on SIMD - // - // if numerator == 1 then this is the first lane - // if numerator == s then this is the last lane - // - SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source); - SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source); - - SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f); - SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count); - - // toggle x_delta sign - SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source))); - - // - // calculate "right" line segment endpoint - // - SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP; - SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source); - SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t)); - - // - // override c1 if this is last point - // - p1x = select(p1x,o1x,is_x_last); - p1y = select(p1y,o1y,is_x_last); - - // - // shuffle up "left" line segment endpoint - // - // NOTE: Intel's shuffle_up is unique with its elegant - // "previous" argument so don't get used to it - // - SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x); - SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y); - - // - // override shuffle up if this is the first line segment - // - p0x = select(p0x,o0x,is_x_first); - p0y = select(p0y,o0y,is_x_first); - - // - // save previous right endpoint - // - p1x_prev = p1x; - p1y_prev = p1y; - - // - // decrement by subgroup size - // - x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - - // - // only non-horizontal subpixel lines are valid - // - SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y); - - // - // if no lanes are active then continue - // - // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY - // IMPACTS PERFORMANCE (+12% ?) - // - // IT SHOULDN'T !!! - // -#if 0 - if (!skc_subgroup_any(is_active)) - continue; -#endif - - // - // Option 1: use SLM for explicitly managed coalesced stores - // - // 1. which tile does this line belong? - // 2. hash tile coordinates - // 3. lookup hash - // 4. if tile matches then SLM append keys - // 5. if tile doesn't match - // a. flush - // b. create new TTSK_RYX - // c. 
obtain TTSB block from pool - // d. goto 3. - // - - // - // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores - // - // 1. which tile does this line belong? - // 2. hash tile coordinates - // 3. lookup hash - // 4. if tile matches then GMEM append keys - // 5. if tile doesn't match - // a. flush (and invalidate empty elems) - // b. create new TTSK_RYX - // c. obtain TTSB block from pool - // d. goto 3. - // - - // - // The virtual rasterization surface is very large and - // signed: +/- ~64K-256K, depending on the architecture. - // - // Rasters must be clipped to the virtual surface and, - // optionally, clipped even further on a per raster - // basis. - // - - // - // Clip to the per-raster clip - // - - /* - - CLIP HERE - - */ - - // - // Hash the tile coordinates - // - // This table lists nominal values for each architecture. - // We want to choose values that are naturally fit the - // "width" of the architecture. - // - // SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS - // ---- ------- ---- --------- -------- --------- - // 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ? - // 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX* - // 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,? - // 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN* - // 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon - // - // NOTE: When possible, bias the hash toward using more y - // bits because of: - // - // 1. the 90 degree counter-clockwise rotation that we put - // in place to offset the render-time clockwise - // rotation - // - // 2. the likely presence of left-to-right or - // right-to-left glyphs. - // - // For power-of-two bins, the hash is easy. - // - // For non-power-of-two, we may want to either implement a - // fast mod (compiler should do this for us... hahahaha) or - // drop down to the next power-of-two. 
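//
// For a power-of-two bin count, the hash described above reduces to
// concatenating the low tile-coordinate bits, biased toward y, which
// is what the kernel computes just below.  A sketch with assumed
// widths (X_BITS=1, Y_BITS=2 -> 8 bins); the real values come from
// the per-device config, not from this example:
//

#define TILE_HASH_X_BITS 1
#define TILE_HASH_Y_BITS 2
#define TILE_HASH_X_MASK ((1u << TILE_HASH_X_BITS) - 1)
#define TILE_HASH_Y_MASK ((1u << TILE_HASH_Y_BITS) - 1)

static
unsigned int
tile_hash(unsigned int const tile_x, unsigned int const tile_y)
{
  // a non-pow2 bin count would use a (fast) mod here instead
  return ((tile_y & TILE_HASH_Y_MASK) << TILE_HASH_X_BITS) |
         ((tile_x & TILE_HASH_X_MASK));
}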
- // - - // - // FIXME -- this snarl is not good -- can probably reduce - // some of the sign casting but some is there to vectorize a - // scalar - // - SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y); - SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y); - - SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x); - SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x); - - SKC_RASTERIZE_INT const min_y = min(z0y,z1y); - SKC_RASTERIZE_INT const max_y = max(z0y,z1y); - - SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2; - - SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y; - SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y); - - // - // map [+1,+32] to [ 0,+31] - // map [-1,-32] to [-1,-32] - // - SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26; - - SKC_RASTERIZE_INT const min_x = min(z0x,z1x); - SKC_RASTERIZE_INT const max_x = max(z0x,z1x); - SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2; - - SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X; - SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x); - - SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx; - - SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) | - (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK)); - - SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF)); - -#if 0 - printf("(%3u, %3u)\n",tile_y,tile_x); -#endif - -#if 0 - if (is_active) - printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx); -#endif - - // - // debug - // -#if 0 // PRINTF_ENABLE - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (is_active C) \ - printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C); - - SKC_RASTERIZE_VECTOR_EXPAND(); -#else - if (is_active) - printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash); -#endif - -#endif - // - // flush all active lanes - // - while (true) - { - // - // either gather load or vector load+shuffle the yx keys - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx; - SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash); -#else - SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash]; -#endif - - // - // does yx for lane match yx for hash? - // - SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID; - SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx); - - // - // OpenCL spec: "When casting a bool to a vector integer - // data type, the vector components will be set to -1 - // (i.e. all bits set) if the vector bool value is true - // and 0 otherwise. - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match; -#else - SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0} -#endif - // - // how many new elements for each matching hash bin? 
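//
// The next few lines answer that question for every bin at once by
// packing one small counter per bin into a single uint, so a single
// subgroup add-scan ranks all bins simultaneously.  A scalar sketch
// (BIN_BITS/BIN_MASK and packed_bin_ranks are illustrative stand-ins
// for the SKC_RASTERIZE_TILE_HASH_BIN_* values):
//

#define BIN_BITS 4
#define BIN_MASK ((1u << BIN_BITS) - 1)

static
void
packed_bin_ranks(unsigned int const n,
                 unsigned int const hash[],  // per-lane bin index
                 unsigned int const match[], // per-lane 0/1 "appends to its bin"
                 unsigned int       rank[])  // out: 1-based rank within the lane's bin
{
  unsigned int iss = 0; // running inclusive scan of the packed counters

  for (unsigned int ii=0; ii<n; ii++)
    {
      iss      += match[ii] << (hash[ii] * BIN_BITS);
      rank[ii]  = (iss >> (hash[ii] * BIN_BITS)) & BIN_MASK;
    }
}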
- // - SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS; - SKC_RASTERIZE_UINT const h = h_match << h_shl; - - // - // prefix sum all of the bins in parallel - // - SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h); - SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss); - - // - // current bin counts - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count; - SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash); -#else - SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash]; -#endif - - // - // calculate where each cache-hit and in-bounds tts should be stored - // - SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1; - SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur; - - // - // which lanes can append to a matching bin? - // - SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS); - - // - // scatter append tts elements to bin blocks - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) - // - // SIMD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (is_append C) \ - { \ - smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \ - smem->bin.aN.count[hash C] = count_new C; \ - } - - SKC_RASTERIZE_VECTOR_EXPAND(); -#else - // - // SIMT - // - if (is_append) - { - smem->bin.aN.ttsb [hash][ttsb_index] = tts; - smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS - } -#endif - // - // try to keep predicate updates SIMD-friendly and - // outside of predicated code paths -- this is not - // always how we would normally do things on SIMT but - // either approach is acceptable - // - - // - // mask off lanes/components that successfully appended - // - is_active = is_active && !is_append; - - // - // are there any active lanes left? - // - if (!skc_subgroup_any(is_active)) - break; - - // - // There are active lanes that couldn't be appended to a - // bin because their hashes collided with the bin's - // current ryx key then those bins must be ejected. - // - // Note that we do not eject "full" bins because lazily - // waiting for a collision results in simpler code. - // - skc_flush(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - subblocks, - blocks, - blocks_next, - sk_v, - sk_v_next, - sk_extent, - smem, - hash, - yx, - is_active); - } - } - } -} - -// -// INITIALIZE SMEM -// -// Note that SIMD/SIMT have nearly the same syntax. 
-// -static -void -skc_smem_init(__local struct skc_subgroup_smem volatile * const smem) -{ - // - // initialize smem bins - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT ); - smem->bin.vN.count = ( 0 ); -#else - // - // SIMT - // - int idx = skc_subgroup_lane(); - -#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) - if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT) -#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) - for (; idxbin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT ); - smem->bin.aN.count[idx] = ( 0 ); - } -#endif -} - -// -// RASTERIZE CUBIC KERNEL -// - -static -void -skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __local struct skc_subgroup_smem volatile * const smem, - - skc_uint * const nodeword, - skc_block_id_t * const id, - - union skc_transform const * const tv, - union skc_path_clip const * const cv, - skc_uint const cohort) -{ - // - // the initial segment idx and segments-per-block constant determine - // how many block ids will need to be loaded - // - SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - // - // apply transform - // - // note that we only care if the end points are rounded to subpixel precision - // - // FIXME -- transformation is currently affine-only support perspective later - // - // the affine transformation requires 8 FMA + 2 ROUND operations - // - SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); - - SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; - SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; - - SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx + c2y * tv->shx + tv->tx; - SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy + tv->ty; - - SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx + c3y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy + tv->ty); - - // - // - // -#if PRINTF_ENABLE - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \ - " { %.02f, %.02f }, { %.02f, %.02f } },\n", \ - b0x C,b0y C,t1x C,t1y C, \ - t2x C,t2y C,t3x C,t3y C); - - SKC_RASTERIZE_VECTOR_EXPAND(); - -#else - - printf("{ { %.02f, %.02f }, { 
%.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n", - b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); - -#endif - -#endif - - // - // OLD APPROACH - // ------------ - // - // The Spinel CUDA rasterizer was significantly more complex and - // performed a few different tasks that are probably best kept - // separate. - // - // The Spinel rasterizer Bezier held 4-element x and y coordinates - // in adjacent lanes. This simplified intermingling of single lane - // 4-coordinate line segments with two-lane cubic Beziers. - // - // After transformation of the input segments, the Spinel rasterizer - // would test cubics for flatness and, if flat, collapse the - // adjacent lanes into a single line lane and an empty lane. - // - // Any lines would then be appended to a line queue. - // - // Any cubics would then be subdivided. - // - // The reclassification process would be repeated. - // - // NEW APPROACH - // ------------ - // - // Assume we're only working with cubics in this kernel. - // - // Optimization: if the line segment is a special case -- a cusp, - // has 1+ inflections, or a loop -- it might be beneficial to - // subdivide the control cage 1+ times in order to separate the - // flatter segments the high-velocity region(s). - // - // This means we want to split using [a,b] formulation to _directly_ - // subdivide producing a new control cage. - // - // Wang's Formula is still useful even if we subdivide once or twice - // as it's so cheap that it might give some useful hints about where - // the high-velocity sections of curve reside. - // - // But it seems like using Wang's and directly flattening to line - // segments without any subdivision is good enough for the limited - // set of test cases that I've tried. - // - // So... use Wang's Formula to estimate how many line segment are - // required to properly flatten the cubics. - // - // Then use inclusive/exclusive scans to put all the lanes to work: - // - // 1. segmenting cubics to line segments - // - // 2. slivering line segments into 1-pixel high line segments - // - // 3. slivering 1-pixel high line segments into 1-pixel wide line - // segments - // - // MORE BACKGROUND ON NEW APPROACH - // ------------------------------- - // - // Two options for handling line segments: - // - // 1. append the line segments onto an SLM array until enough - // work has been accrued (Spinel does this) - // - // 2. immediately sliver the potentially multi-pixel line - // segments into subpixel lines - // - // The advantage of (1) is that it guarantees the slivering - // process will, on average, always be emitting a full subgroup - // of subpixel lines. - // - // The advantage of (2) is that it reduces code complexity and - // leaves more room for SLM tile bins. The difference between Spinel - // and Skia Compute is that Wang's Formula guarantees there will be - // a full subgroup of multi-pixel lines unless this is the final - // iteration of the warp of multi-pixel lines. - // - // Note that wider GPU architectures might benefit from (1) and - // other work accumulation strategies because it will minimize - // partial warp workloads in the final iteration of each stage. It - // also minimizes the sunk cost of the uniform control logic steps. - // - // So let's implement (2) for now... - // - - // - // And... begin! - // - // Estimate how many line segments are in quad/cubic curve. - // - // Wang's Formula will return zero if the control points are - // collinear but we bump it up to 1.0f. 
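//
// The same Wang's bound, spelled out for one scalar cubic in plain C
// (wang_cubic_segments is an illustrative name; epsilon plays the
// role of SKC_WANG_EPSILON and is in the same units as the control
// points):
//

#include <math.h>

static
double
wang_cubic_segments(double const x0, double const y0,
                    double const x1, double const y1,
                    double const x2, double const y2,
                    double const x3, double const y3,
                    double const epsilon)
{
  // max magnitude of the two second differences of the control cage
  double const ddx = fmax(fabs(x2 - 2.0*x1 + x0), fabs(x3 - 2.0*x2 + x1));
  double const ddy = fmax(fabs(y2 - 2.0*y1 + y0), fabs(y3 - 2.0*y2 + y1));
  double const dd  = sqrt(ddx*ddx + ddy*ddy);

  // number of evenly spaced parametric segments within epsilon of the
  // curve -- collinear (or coincident) control points yield 1
  return fmax(1.0, ceil(sqrt((3.0 * 2.0) / (8.0 * epsilon) * dd)));
}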
- // - SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); - - // - // if there are free registers then precalculate the reciprocal for - // each estimated segments since it will never change - // - SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); - - - // - // inclusive add scan of estimated line segments - // exclusive add scan of estimated line segments - // total number of estimated line segments - // - SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); - SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; - float s_rem = skc_subgroup_last_float(s_iss); // scalar - - // - // Precompute cubic polynomial coefficients from transformed control - // cage so we can shuffle them in on each iteration of the outer - // loop and then evaluate the polynomial in Horner form. - // - // | 1 0 0 0 | | c0 | - // | | | | - // | -3 3 0 0 | | c1 | - // B(t) = [ 1 t^1 t^2 t^3 ] | | | | - // | 3 -6 3 0 | | c2 | - // | | | | - // | -1 3 -3 1 | | c3 | - // - // - SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL - SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL - - SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL - SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL - - SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB - SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB - - // - // these values don't matter on the first iteration - // - SKC_RASTERIZE_FLOAT l1x_prev = 0; - SKC_RASTERIZE_FLOAT l1y_prev = 0; - - // - // allocate and init in-register TTSK keys - // - skc_uint sk_v_next = 0; - skc_ttsk_v_t sk_v; - - sk_v.hi = cohort; - - // - // initialize smem - // - skc_smem_init(smem); - - // - // initialize blocks / subblocks - // - skc_block_id_v_t blocks; - skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t subblocks = 0; -#endif - - // - // loop until done - // - while (s_rem > 0) - { - // - // distribute work across lanes - // - SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); - - // - // every lane has a fraction to work off of - // - // FIXME -- this gets expanded on SIMD - // - // if delta == 1 then this is the first lane - // if count == s_segs then this is the last lane - // - SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); - SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); - - SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); - SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); - - // - // init parametric t - // - SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? - - // - // if last then override to a hard 1.0f - // - s_t = is_s_last ? 
1.0f : s_t; - - // - // decrement by subgroup size - // - s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - - // - // now every lane knows what to do and the following lines will - // pump out up to SUBGROUP_SIZE line segments - // - // obtain the src vertices through shared or via a shuffle - // - - // - // shuffle in the polynomial coefficients their source lane - // - SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); - SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); - - SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); - SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); - - SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); - SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); - - SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source); - SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source); - - // - // calculate "right" line segment endpoint using Horner form - // - SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND - SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND - - // - // shuffle up "left" line segment endpoint - // - // NOTE: Intel's shuffle_up is unique with its elegant - // "previous" argument so don't get used to it - // - SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); - SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); - - // - // save previous right endpoint - // - l1x_prev = l1x; - l1y_prev = l1y; - - // - // override shuffle up if this is the first line segment - // - l0x = select(l0x,s0x,is_s_first); - l0y = select(l0y,s0y,is_s_first); - - // - // sliver lines - // - skc_sliver(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &subblocks, - &blocks, - &blocks_next, - &sk_v, - &sk_v_next, - sk_extent, - smem, - l0x,l0y,l1x,l1y); - } - - // - // - flush work-in-progress blocks - // - return unused block ids - // - skc_finalize(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &blocks, - blocks_next, - &sk_v, - sk_v_next, - sk_extent, - smem); -} - -// -// RASTERIZE QUAD KERNEL -// - -static -void -skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __local struct skc_subgroup_smem volatile * const smem, - - skc_uint * const nodeword, - skc_block_id_t * const id, - - union skc_transform const * const tv, - union skc_path_clip const * const cv, - skc_uint const cohort) -{ - // - // the initial segment idx and segments-per-block constant determine - // how many block ids will need to be loaded - // - SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - 
skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - // - // apply transform - // - // note that we only care if the end points are rounded to subpixel precision - // - // FIXME -- transformation is currently affine-only support perspective later - // - // the affine transformation requires 8 FMA + 2 ROUND operations - // - SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); - - SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; - SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; - - SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx + c2y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy + tv->ty); - - // - // Estimate how many line segments are in quad/cubic curve. - // - // Wang's Formula will return zero if the control points are - // collinear but we bump it up to 1.0f. - // - SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y); - - // - // if there are free registers then precalculate the reciprocal for - // each estimated segments since it will never change - // - SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); - - - // - // inclusive add scan of estimated line segments - // exclusive add scan of estimated line segments - // total number of estimated line segments - // - SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); - SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; - float s_rem = skc_subgroup_last_float(s_iss); // scalar - - // - // Precompute quadratic polynomial coefficients from control cage so - // we can shuffle them in on each iteration of the outer loop and - // then evaluate the polynomial in Horner form. 
- // - - // | 1 0 0 | | c0 | - // | | | | - // B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 | - // | | | | - // | 1 -2 1 | | c2 | - // - // - SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL - SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL - - SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD - SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD - - // - // these values don't matter on the first iteration - // - SKC_RASTERIZE_FLOAT l1x_prev = 0; - SKC_RASTERIZE_FLOAT l1y_prev = 0; - - // - // allocate and init in-register TTSK keys - // - skc_uint sk_v_next = 0; - skc_ttsk_v_t sk_v; - - sk_v.hi = cohort; - - // - // initialize smem - // - skc_smem_init(smem); - - // - // initialize blocks / subblocks - // - skc_block_id_v_t blocks; - skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t subblocks = 0; -#endif - - // - // loop until done - // - while (s_rem > 0) - { - // - // distribute work across lanes - // - SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); - - // - // every lane has a fraction to work off of - // - // FIXME -- this gets expanded on SIMD - // - // if delta == 1 then this is the first lane - // if count == s_segs then this is the last lane - // - SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); - SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); - - SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); - SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); - - // - // init parametric t - // - SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? - - // - // if last then override to a hard 1.0f - // - s_t = is_s_last ? 
1.0f : s_t; - - // - // decrement by subgroup size - // - s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - - // - // now every lane knows what to do and the following lines will - // pump out up to SUBGROUP_SIZE line segments - // - // obtain the src vertices through shared or via a shuffle - // - - // - // shuffle in the polynomial coefficients their source lane - // - SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); - SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); - - SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); - SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); - - SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); - SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); - - // - // calculate "right" line segment endpoint using Horner form - // - SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND - SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND - - // - // shuffle up "left" line segment endpoint - // - // NOTE: Intel's shuffle_up is unique with its elegant - // "previous" argument so don't get used to it - // - SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); - SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); - - // - // save previous right endpoint - // - l1x_prev = l1x; - l1y_prev = l1y; - - // - // override shuffle up if this is the first line segment - // - l0x = select(l0x,s0x,is_s_first); - l0y = select(l0y,s0y,is_s_first); - - // - // sliver lines - // - skc_sliver(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &subblocks, - &blocks, - &blocks_next, - &sk_v, - &sk_v_next, - sk_extent, - smem, - l0x,l0y,l1x,l1y); - } - - // - // - flush work-in-progress blocks - // - return unused block ids - // - skc_finalize(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &blocks, - blocks_next, - &sk_v, - sk_v_next, - sk_extent, - smem); -} - -// -// RASTERIZE LINE KERNEL -// - -static -void -skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __local struct skc_subgroup_smem volatile * const smem, - - skc_uint * const nodeword, - skc_block_id_t * const id, - - union skc_transform const * const tv, - union skc_path_clip const * const cv, - skc_uint const cohort) -{ - // - // the initial segment idx and segments-per-block constant determine - // how many block ids will need to be loaded - // - SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - -#if 0 - // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y); - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y); -#endif - - // - // apply transform - // - // note that we only care if the end points are rounded to subpixel precision - 
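For reference, the conversion matrix shown earlier and the 2-MAD Horner evaluation in the loop above amount to the following arithmetic; the control-point values are made up, and B(0) and B(1) land back on c0 and c2 as expected:

#include <math.h>
#include <stdio.h>

int main(void)
{
  /* made-up (already transformed) quadratic control points for one axis */
  float const c0 = 10.0f, c1 = 40.0f, c2 = 20.0f;

  /* power-basis coefficients from the conversion matrix */
  float const b0 = c0;
  float const b1 = fmaf(-2.0f, c0, 2.0f * c1);   /* -2*c0 + 2*c1    */
  float const b2 = fmaf(-2.0f, c1, c0 + c2);     /*  c0 - 2*c1 + c2 */

  /* Horner: B(t) = b0 + t*(b1 + t*b2) -- the 2 MADs in the loop above */
  for (int i = 0; i <= 4; i++)
    {
      float const t = 0.25f * (float)i;
      printf("B(%.2f) = %g\n", t, fmaf(fmaf(b2, t, b1), t, b0));
    }

  return 0;   /* B(0.00) = 10 (== c0) ... B(1.00) = 20 (== c2) */
}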
// - // FIXME -- transformation is currently affine-only - // FIXME -- support perspective later - // - // the affine transformation requires 8 FMA + 4 ROUND operations - // - SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); - - SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx + c1y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy + tv->ty); - -#if 0 - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y); -#endif - - // - // allocate and init in-register TTSK keys - // - skc_uint sk_v_next = 0; - skc_ttsk_v_t sk_v; - - sk_v.hi = cohort; - - // - // initialize smem - // - skc_smem_init(smem); - - // - // initialize blocks / subblocks - // - skc_block_id_v_t blocks; - skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t subblocks = 0; -#endif - - // - // sliver lines - // - skc_sliver(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &subblocks, - &blocks, - &blocks_next, - &sk_v, - &sk_v_next, - sk_extent, - smem, - l0x,l0y,l1x,l1y); - - // - // - flush work-in-progress blocks - // - return unused block ids - // - skc_finalize(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &blocks, - blocks_next, - &sk_v, - sk_v_next, - sk_extent, - smem); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. 
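The endpoint transform above is the same two-FMA-plus-round pattern per coordinate that the quad and cubic paths use. A minimal C sketch, assuming an identity transform pre-scaled to a 32x32 subpixel grid:

#include <math.h>
#include <stdio.h>

/* hypothetical layout mirroring { sx shx tx shy sy ty } */
struct xform { float sx, shx, tx, shy, sy, ty; };

static void transform_round(struct xform const * const t,
                            float const x, float const y,
                            float * const ox, float * const oy)
{
  /* 2 fma's + 1 round per coordinate; endpoints snap to the subpixel grid */
  *ox = roundf(fmaf(x, t->sx,  fmaf(y, t->shx, t->tx)));
  *oy = roundf(fmaf(x, t->shy, fmaf(y, t->sy,  t->ty)));
}

int main(void)
{
  /* identity pre-scaled by an assumed 32x32 subpixel resolution */
  struct xform const t = { 32.0f, 0.0f, 0.0f, 0.0f, 32.0f, 0.0f };

  float x, y;
  transform_round(&t, 3.37f, 7.81f, &x, &y);
  printf("(3.37, 7.81) pixels -> (%g, %g) subpixels\n", x, y);
  return 0;
}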
- // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("+cmd_idx = %u\n",cmd_idx); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("-cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("[ %u ]< %u, %u, %u, %u >\n", - cmd_idx, - cmd.nodeword, - SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd), - SKC_CMD_RASTERIZE_GET_CLIP(cmd), - SKC_CMD_RASTERIZE_GET_COHORT(cmd)); -#endif - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id); - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. - // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. 
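A tagged block id packs a small path-segment tag alongside the block id. The sketch below assumes a 5-bit tag in the low bits purely for illustration; the real field widths come from the SKC_TAGGED_BLOCK_ID_* macros:

#include <stdio.h>

/* Assumed split: the tag in the low bits, the block id in the rest. */
#define TAG_BITS 5u
#define TAG_MASK ((1u << TAG_BITS) - 1u)

static unsigned tagged_get_tag(unsigned const tag_id) { return tag_id &  TAG_MASK; }
static unsigned tagged_get_id (unsigned const tag_id) { return tag_id >> TAG_BITS; }

int main(void)
{
  unsigned const tag_id = (1234u << TAG_BITS) | 3u;   /* block id 1234, tag 3 */

  printf("tag = %u, block id = %u\n", tagged_get_tag(tag_id), tagged_get_id(tag_id));
  return 0;
}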
- // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - switch (tag) - { - case SKC_BLOCK_ID_TAG_PATH_LINE: - skc_rasterize_lines(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_QUAD: - skc_rasterize_quads(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_CUBIC: - skc_rasterize_cubics(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD: - break; - case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: - break; - - default: - break; - } -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. 
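Since the host is responsible for pre-scaling transforms to subpixel resolution, here is a minimal host-side sketch of that pre-multiplication. Field names mirror the v8 layout above; the w0/w1 perspective terms are ignored and the 32x32 resolution is an assumption:

#include <stdio.h>

/* { sx shx tx shy sy ty } -- the kernel's float8 also carries w0/w1 for a
 * future perspective path; those two terms are ignored here               */
struct affine { float sx, shx, tx, shy, sy, ty; };

/* pre-multiply a user transform by the subpixel scale so the device only
 * ever sees coordinates in subpixel units                                 */
static struct affine prescale(struct affine const u,
                              float const resl_x, float const resl_y)
{
  struct affine const s = {
    u.sx  * resl_x, u.shx * resl_x, u.tx * resl_x,
    u.shy * resl_y, u.sy  * resl_y, u.ty * resl_y
  };
  return s;
}

int main(void)
{
  struct affine const user = { 2.0f, 0.0f, 10.0f, 0.0f, 2.0f, 5.0f };
  struct affine const dev  = prescale(user, 32.0f, 32.0f);

  printf("device-side tx,ty = %g,%g (translation now in subpixels)\n", dev.tx, dev.ty);
  return 0;
}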
- // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. - // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_lines(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. - // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. 
- // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_quads(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. - // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. 
- // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_cubics(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - ; -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - ; -} - -// -// -// diff --git a/src/compute/skc/rasters_alloc.cl b/src/compute/skc/rasters_alloc.cl deleted file mode 100644 index f8f76a7b39..0000000000 --- a/src/compute/skc/rasters_alloc.cl +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "raster.h" -#include "tile.h" - -// -// There is a fixed-size meta table per raster cohort that we use to -// peform a mostly coalesced sizing and allocation of blocks. -// -// This code is simple and fast. -// - -__kernel -SKC_RASTERS_ALLOC_KERNEL_ATTRIBS -void -skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global skc_block_id_t const * const bp_ids, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t * const map, - __global skc_uint * const metas, - __global skc_uint const * const raster_ids, // FIXME -- CONSTANT - skc_uint const count) -{ - // access to the meta extent is linear - skc_uint const gid = get_global_id(0); - skc_bool const is_active = gid < count; - - // - // init with defaults for all lanes - // - union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } }; - skc_uint raster_id = SKC_UINT_MAX; - skc_uint extra_blocks = 0; - - if (is_active) - { - // load meta_in - meta.in.u32v4 = vload4(gid,metas); - - // load raster_id as early as possible - raster_id = raster_ids[gid]; - -#if 0 - printf("%3u + %5u, %5u, %5u, %5u\n", - gid, - meta.in.blocks, - meta.in.offset, - meta.in.pk, - meta.in.rk); -#endif - - // how many blocks will the ttpb blocks consume? 
- extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) / - SKC_DEVICE_SUBBLOCKS_PER_BLOCK); - - // total keys - meta.out.keys += meta.in.pk; - - // how many blocks do we need to store the keys in the head and trailing nodes? - skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) / - (SKC_RASTER_NODE_DWORDS - 1)); - // increment blocks - extra_blocks += hn; - - // how many nodes trail the head? - meta.out.nodes = hn - 1; - - // update blocks - meta.out.blocks += extra_blocks; - -#if 0 - printf("%3u - %5u, %5u, %5u, %5u\n", - gid, - meta.out.blocks, - meta.out.offset, - meta.out.nodes, - meta.out.keys); -#endif - } - - // - // allocate blocks from block pool - // - // first perform a prefix sum on the subgroup to reduce atomic - // operation traffic - // - // note this idiom can be implemented with vectors, subgroups or - // workgroups - // - - skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks); - skc_uint reads = 0; - - // last lane performs the block pool allocation with an atomic increment - if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) { - reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads - } - - // broadcast block pool base to all lanes - reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1); - - // update base for each lane - reads += prefix - extra_blocks; - - // - // store meta header - // - if (is_active) - { - // store headers back to meta extent - vstore4(meta.out.u32v4,gid,metas); - - // store reads - metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads; - - // get block_id of each raster head - skc_block_id_t const block_id = bp_ids[reads & bp_mask]; - - // update map - map[raster_id] = block_id; - -#if 0 - printf("alloc: %u / %u\n",raster_id,block_id); -#endif - } -} - -// -// -// diff --git a/src/compute/skc/rasters_reclaim.cl b/src/compute/skc/rasters_reclaim.cl deleted file mode 100644 index f0abdb0381..0000000000 --- a/src/compute/skc/rasters_reclaim.cl +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
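The allocation idiom above -- an inclusive scan across the subgroup, a single atomic add issued by the last lane, then a broadcast and a per-lane base adjustment -- can be restated in plain C as follows. The lane counts, the ring cursor and the pow2 mask are illustrative:

#include <stdio.h>

#define LANES 8

int main(void)
{
  unsigned const extra[LANES] = { 2, 0, 3, 1, 0, 4, 2, 1 };   /* blocks wanted per lane */

  /* inclusive prefix sum across the "subgroup" */
  unsigned prefix[LANES], sum = 0;
  for (int i = 0; i < LANES; i++) { sum += extra[i]; prefix[i] = sum; }

  /* only the last lane touches the shared cursor -- an atomic fetch-add
   * in the kernel -- and fetches the old value                            */
  unsigned ring_reads  = 100;               /* illustrative starting cursor */
  unsigned const reads = ring_reads;
  ring_reads += prefix[LANES - 1];

  /* every lane derives its own base slot in the pow2 block-pool id ring */
  unsigned const bp_mask = 0x3FFu;
  for (int i = 0; i < LANES; i++)
    {
      unsigned const base = reads + prefix[i] - extra[i];
      printf("lane %d: %u ids starting at ring slot %u\n", i, extra[i], base & bp_mask);
    }

  return 0;
}

One scan plus one atomic per subgroup, instead of one atomic per lane, is exactly the "mostly coalesced" sizing the comment above describes.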
- * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "block.h" -#include "raster.h" -#include "common.h" -#include "tile.h" - -// -// -// - -#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS) - -#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS) - -// -// -// - -#if ( SKC_RASTERS_RECLAIM_X == 1 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_RASTERS_RECLAIM_X == 2 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_RASTERS_RECLAIM_X == 4 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_RASTERS_RECLAIM_X == 8 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_RASTERS_RECLAIM_X == 16) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_RASTERS_RECLAIM_X" -#endif - -#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \ - (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) - -#endif - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - 
SKC_BROADCAST_LAST_HELPER(E,I) - -// -// COMPILE-TIME PREDICATES -// - -#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \ - (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \ - SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I) - -#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \ - SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I) - -// -// RUN-TIME PREDICATES -// - -#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \ - (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS) - -// -// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL -// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK -// COMBOS (NOT NECESSARILY POW2) -// -// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR -// UINT TYPE INSTEAD OF A ULONG. -// - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint - -// -// -// - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ - (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ - ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ - S = sub_group_scan_exclusive_add(C) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \ - (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK) - -// -// -// - -struct skc_reclaim -{ - skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE]; -}; - -__kernel -SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS -void -skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring - __global skc_uint * const bp_elems, // block pool blocks - __global skc_uint volatile * const bp_atomics, // read/write atomics - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const map, // raster host-to-device map - struct skc_reclaim const reclaim) // array of host raster ids -{ -#if (__OPENCL_VERSION__ < 200) - skc_uint const reclaim_stride = get_num_sub_groups(); -#else - skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); - -#if 0 - // - // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT - // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL - // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE - // RECLAMATION JOB ON THE REST OF THE PIPELINE. 
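The packed-count machinery above folds one small counter per register slot into a single uint so that one subgroup exclusive scan produces every slot's index at once. A scalar sketch with an assumed 3-bit field width and two slots per lane:

#include <stdio.h>

#define LANES      8
#define SLOTS      2    /* registers ("I") per lane                               */
#define FIELD_BITS 3    /* log2(LANES): an exclusive prefix never exceeds LANES-1 */
#define FIELD_MASK ((1u << FIELD_BITS) - 1u)

int main(void)
{
  /* does register slot s of lane l hold a reclaimable (block-aligned) id? */
  unsigned const is_block[LANES][SLOTS] = {
    {1,0},{0,1},{1,1},{0,0},{1,0},{1,1},{0,0},{1,0}
  };

  /* each lane packs one counter per slot into a single uint */
  unsigned packed[LANES];
  for (int l = 0; l < LANES; l++)
    {
      packed[l] = 0;
      for (int s = 0; s < SLOTS; s++)
        packed[l] |= is_block[l][s] << (FIELD_BITS * s);
    }

  /* one exclusive scan across lanes advances every field at once */
  unsigned scan = 0;
  for (int l = 0; l < LANES; l++)
    {
      unsigned const index = scan;   /* exclusive prefix seen by this lane */
      scan += packed[l];

      for (int s = 0; s < SLOTS; s++)
        printf("lane %d slot %d: count=%u index=%u\n", l, s,
               (packed[l] >> (FIELD_BITS * s)) & FIELD_MASK,
               (index     >> (FIELD_BITS * s)) & FIELD_MASK);
    }

  return 0;
}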
- // - for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) -#endif - { - // get host raster id - skc_raster_h const raster = reclaim.aN[reclaim_idx]; - - // get block id of raster header - skc_block_id_t id = map[raster]; - - // - // load all of the head block ttxk.lo keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - // load raster header counts -- we only need the blocks and - // nodes words the keys are doublewords. - // - // FIXME -- this can be made portable with compile-time macro expansion - // - skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES - skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes); - } -#endif - // - // acquire a span in the block pool ids ring for reclaimed ids - // - skc_uint bp_ids_base = 0; - - if (get_sub_group_local_id() == 0) { - bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); - } - - bp_ids_base = sub_group_broadcast(bp_ids_base,0); - - // - // mask off everything but the block id - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - h##I = h##I & SKC_TTXK_LO_MASK_ID; \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; -#if 0 - printf("rasters next = %u\n",id); -#endif - } - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%08X %u\n",h##I,h##I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - printf("%08X\n",h##I); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - - // - // - we'll skip subgroups that are entirely header - // - // - but we need to mark any header elements that partially fill - // a subgroup as subblocks - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \ - if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \ - h##I = SKC_UINT_MAX; \ - } \ - } \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - { - // - // count reclaimable blocks in each lane - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // 
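The "swap current id with next" step above works because the last masked key in each block is the link to the next block; writing the current id into that register lets the same ring store recycle the block being walked. A simplified sequential walk of such a chain:

#include <stdio.h>

#define BLOCK_WORDS 4
#define POOL_BLOCKS 8
#define END_OF_LIST 0xFFFFFFFFu

int main(void)
{
  /* tiny block pool: the last word of each block links to the next block */
  unsigned pool[POOL_BLOCKS][BLOCK_WORDS] = { { 0 } };

  /* build a 3-block chain: 5 -> 2 -> 6 -> end */
  pool[5][BLOCK_WORDS - 1] = 2;
  pool[2][BLOCK_WORDS - 1] = 6;
  pool[6][BLOCK_WORDS - 1] = END_OF_LIST;

  unsigned id = 5;   /* head block id, e.g. from the raster map */

  while (id != END_OF_LIST)
    {
      unsigned const next = pool[id][BLOCK_WORDS - 1];

      /* the kernel overwrites its in-register copy of this link with the
       * current id so the very same ring store recycles this block too   */
      printf("reclaim block %u\n", id);

      id = next;
    }

  return 0;
}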
store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = h##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - } - - // printf("R %7u ! %u\n",bp_ids_idx,h##I); - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, walk the nodes - // - do { - // id of next block is in last lane - id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); - - // - // load all of the node block ttxk.lo keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // mask off everything but the block id - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - n##I = n##I & SKC_TTXK_LO_MASK_ID; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; -#if 0 - printf("rasters next = %u\n",id); -#endif - } - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%08X %u\n",n##I,n##I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - - // - // count reclaimable blocks in each lane - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = n##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // printf("R %7u ! %u\n",bp_ids_idx,n##I); - - // any more nodes? - } while (--count_nodes > 0); - } -} - -// -// -// diff --git a/src/compute/skc/render.cl b/src/compute/skc/render.cl deleted file mode 100644 index ba2fd7bbfd..0000000000 --- a/src/compute/skc/render.cl +++ /dev/null @@ -1,2165 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "block.h" -#include "tile.h" -#include "atomic_cl.h" -#include "styling_types.h" - -// -// -// - -#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1) - -// -// -// - -#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15 -#endif - -// -// tile state flag bits -// - -typedef enum skc_tile_flags_e { - - // FLUSH - SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001, - SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002, - SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004, - - // OPACITY - SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008, - - // - // Note: testing for opacity and skipping scattering is on its way - // to becoming a much more programmable option because sometimes we - // may be compositing/blending from back-to-front and/or be using - // group blend rules that ignore opacity. - // - // The point is that all of these decisions should be encoded in - // styling commands and, as much as possible, removed from the final - // group/layer styling traversal render loop. - // - -} skc_tile_flags_e; - -// -// COVER -- assumes availability of either fp16 or fp32 -// - -union skc_tile_cover -{ - struct { - SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH]; - } aN; - -#ifdef SKC_RENDER_TILE_COVER_VECTOR - struct { - SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT]; - } vN; -#endif -}; - -// -// COLOR -- assumes availability of either fp16 or fp32 -// - -union skc_tile_color -{ - union { - struct { - SKC_RENDER_TILE_COLOR r; - SKC_RENDER_TILE_COLOR g; - SKC_RENDER_TILE_COLOR b; - SKC_RENDER_TILE_COLOR a; - } rgba[SKC_TILE_WIDTH]; - } aN; - -#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED - union { - SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH]; - } iN; -#endif - -#ifdef SKC_RENDER_TILE_COLOR_VECTOR - union { - SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT]; - } vN; -#endif - - struct { - union { - struct { - SKC_RENDER_TILE_COLOR r; - SKC_RENDER_TILE_COLOR g; - }; - SKC_RENDER_GRADIENT_FLOAT distance; - }; - union { - struct { - SKC_RENDER_TILE_COLOR b; - SKC_RENDER_TILE_COLOR a; - }; - SKC_RENDER_GRADIENT_FLOAT stoplerp; - }; - } grad[SKC_TILE_WIDTH]; -}; - -// -// SHARED MEMORY STATE -// - -#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT) - -#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE) -#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA)) - -// -// -// - -union skc_subgroup_smem -{ - // - // The tiles are stored in column-major / height-major order - // - // The final column is a guard column that is OK to write to but - // will never be read. It simplifies the TTSB scatter but could be - // predicated if SMEM is really at a premium. 
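The guard-column layout described above means a scatter that falls just past the right edge of the tile still has a legal destination. A tiny C model of the column-major indexing:

#include <stdio.h>

#define TILE_WIDTH  8
#define TILE_HEIGHT 8

/* column-major accumulator with one guard column: area[w][h] */
static int area[(TILE_WIDTH + 1) * TILE_HEIGHT];

static int * area_at(int const w, int const h)
{
  return &area[w * TILE_HEIGHT + h];   /* height-major within each column */
}

int main(void)
{
  /* a scatter aimed one column past the tile lands in the guard column:
   * written but never read back, so no bounds test is needed             */
  *area_at(TILE_WIDTH, 3) += 42;

  *area_at(2, 5) += 7;
  printf("area[2][5] = %d\n", *area_at(2, 5));
  return 0;
}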
- // -#if ( SKC_RENDER_SUBGROUP_SIZE > 1 ) - struct { - SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] - } atomic; -#endif - - struct { - int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] - } aN; - - struct { // assumption is that height = subgroup - SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE]; - } vN; - - struct { // assumption is that height = subgroup - SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE]; - } wide; - - union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; - - half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; - -#if 0 - // - // SPILL TO GMEM - // -#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0) - struct { - -#if (SKC_REGS_COLOR_S > 0) - union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; -#endif - -#if (SKC_REGS_COVER_S > 0) - union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; -#endif - - } regs; -#endif - // - // - // -#endif -}; - -// -// -// - -#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) - -#define skc_subgroup_lane() 0 - -#else - -#define skc_subgroup_lane() get_sub_group_local_id() - -#endif - -// -// -// - -typedef skc_uint skc_ttsk_lo_t; -typedef skc_uint skc_ttsk_hi_t; - -typedef skc_uint skc_ttpk_lo_t; -typedef skc_uint skc_ttpk_hi_t; - -typedef skc_uint skc_ttxk_lo_t; -typedef skc_uint skc_ttxk_hi_t; - -typedef skc_uint skc_ttck_lo_t; -typedef skc_uint skc_ttck_hi_t; - -typedef skc_uint2 skc_ttck_t; - -typedef skc_int skc_ttxb_t; - -// -// TTCK (32-BIT COMPARE) v1: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 18 | 7 | 7 | -// -// -// TTCK (32-BIT COMPARE) v2: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 15 | 9 | 8 | -// -// -// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 27 | 1 | 1 | 18 | 9 | 8 | -// - -static -skc_uint -skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) -{ - return a & SKC_TTCK_LO_MASK_ID; -} - -static -skc_layer_id -skc_ttck_get_layer(skc_ttck_t const a) -{ - // - // FIXME -- a union with a ulong and a shift down and mask is - // probably faster on some architectures - // - skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); - skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; - - return lo | hi; -} - -static -skc_uint -skc_ttck_hi_get_x(skc_ttck_hi_t const a) -{ - return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); -} - -static -skc_uint -skc_ttck_hi_get_y(skc_ttck_hi_t const a) -{ - return a >> SKC_TTCK_HI_OFFSET_Y; -} - -static -skc_bool -skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b) -{ - skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); - skc_uint const hi = (a.hi ^ b.hi); - - return (lo | hi) == 0; -} - -static -skc_bool -skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) -{ - return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; -} - -static -skc_bool -skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) -{ - return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; -} - -// -// TILE TRACE SUBPIXEL -// -// The subpixels are encoded with either absolute tile coordinates -// (32-bits) or packed in delta-encoded form 
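Putting the helper functions and the "64-bit compare" layout above together gives a plain-C decode of one TTCK key. The exact field placement, with three layer bits spilling into the low word, is inferred from those helpers and should be treated as a sketch:

#include <stdint.h>
#include <stdio.h>

/* Assumed placement for the 64-bit-compare variant:
 *   lo: | id:27 | prefix:1 | escape:1 | layer[2:0]:3 |
 *   hi: | layer[17:3]:15 | x:9 | y:8 |
 */
typedef struct { uint32_t lo, hi; } ttck_t;

static uint32_t ttck_id    (ttck_t const k) { return k.lo & ((1u << 27) - 1u); }
static uint32_t ttck_prefix(ttck_t const k) { return (k.lo >> 27) & 1u; }
static uint32_t ttck_layer (ttck_t const k) { return (k.lo >> 29) | ((k.hi & 0x7FFFu) << 3); }
static uint32_t ttck_x     (ttck_t const k) { return (k.hi >> 15) & 0x1FFu; }
static uint32_t ttck_y     (ttck_t const k) { return  k.hi >> 24; }

int main(void)
{
  /* pack a key by hand: id=1000, prefix=1, layer=300, x=17, y=9 */
  uint32_t const layer = 300u;
  ttck_t   const k     = {
    .lo = 1000u | (1u << 27) | ((layer & 0x7u) << 29),
    .hi = (layer >> 3) | (17u << 15) | (9u << 24)
  };

  printf("id=%u prefix=%u layer=%u x=%u y=%u\n",
         (unsigned)ttck_id(k), (unsigned)ttck_prefix(k),
         (unsigned)ttck_layer(k), (unsigned)ttck_x(k), (unsigned)ttck_y(k));
  return 0;
}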
form. -// -// For 32-bit subpixel packing of a 32x32 tile: -// -// A tile X is encoded as: -// -// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate. -// -// SX : 6 : unsigned subpixel span from min to max x with range -// [0,32]. The original direction is not captured. Would -// be nice to capture dx but not necessary right now but -// could be in the future. <--- SPARE VALUES AVAILABLE -// -// A tile Y is encoded as: -// -// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate. -// -// DY : 6 : signed subpixel delta y1-y0. The range of delta is -// [-32,32] but horizontal lines are not encoded so [1,32] -// is mapped to [0,31]. The resulting range [-32,31] fits -// in 6 bits. -// -// TTS: -// -// 0 31 -// | TX | SX | TY | DY | -// +-----+------+-----+------+ -// | 10 | 6 | 10 | 6 | -// - -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a) -{ - // - // extract the whole pixel y coordinate - // - return SKC_BFE(a, - SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2, - SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2); -} - -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a) -{ - // - // get the linear array tile index of the pixel - // - return (((a & SKC_TTS_MASK_TX_PIXEL) - -#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2) - >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2) -#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2) - << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2) -#endif - - ) | skc_tts_get_ty_pixel_v(a)); -} - -#if 0 -static -skc_ttx_v_s32_t -skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) -{ - skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY; - - return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31)); -} -#else -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) -{ - SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY; - - return dy - (~a >> 31); -} -#endif - -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a) -{ - return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2); -} - -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_sx_v(SKC_RENDER_TTS_V const a) -{ - return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX); -} - -// -// -// - -static -void -skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem) -{ - // - // SIMD / CPU - // - // & - // - // SIMT / GPU - // - // Note that atomic_init() is likely implemented as a simple - // assignment so there is no identifiable performance difference on - // current targets. - // - // If such an architecture appears in the future then we'll probably - // still want to implement this zero'ing operation as below but - // follow with an appropriate fence that occurs before any scatter - // operations. - // - // The baroque expansion below improves performance on Intel GEN by, - // presumably, achieving the 64-byte per clock SLM write as well as - // minimizing the overall number of SEND() block initializations and - // launches. - // - // Intel GENx has a documented 64 byte per cycle SLM write limit. - // So having each lane in an 8 lane subgroup zero-write 8 bytes is - // probably a safe bet (Later: benchmarking backs this up!). - // - // Note there is no reason at this time to unroll this loop. - // - for (uint ii=0; iiwide.area[ii][skc_subgroup_lane()] = ( 0 ); -} - -// -// Note this is going to be vectorizable on most architectures. -// -// The return of the key translation feature might complicate things. 
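A standalone decode of the TTS word documented above, including the [0,31] to [1,32] remapping of non-negative dy. The low-to-high field ordering and the reliance on an arithmetic right shift mirror the accessors above but are stated here as assumptions:

#include <stdint.h>
#include <stdio.h>

/* | tx:10 | sx:6 | ty:10 | dy:6 | -- dy is signed and lives in the top bits */
static uint32_t tts_tx(uint32_t const a) { return  a        & 0x3FFu; }
static uint32_t tts_sx(uint32_t const a) { return (a >> 10) & 0x3Fu;  }
static uint32_t tts_ty(uint32_t const a) { return (a >> 16) & 0x3FFu; }

static int32_t tts_dy(uint32_t const a)
{
  int32_t const dy = (int32_t)a >> 26;       /* arithmetic shift sign-extends */

  /* stored range is [-32,31]; [0,31] maps back to [1,32] because dy == 0
   * never occurs -- i.e. "if dy >= 0 then ++dy"                            */
  return dy - (~(int32_t)a >> 31);
}

int main(void)
{
  uint32_t const a = ((uint32_t)7 << 26) | (5u << 16) | (3u << 10) | 9u;

  printf("tx=%u sx=%u ty=%u dy=%d\n",
         (unsigned)tts_tx(a), (unsigned)tts_sx(a), (unsigned)tts_ty(a), (int)tts_dy(a));
  return 0;   /* tx=9 sx=3 ty=5 dy=8 */
}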
-// - -static -void -skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, - __local union skc_subgroup_smem * SKC_RESTRICT const smem, - skc_block_id_t const pb_id) -{ - skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); - -#if ( SKC_TILE_RATIO == 1 ) - - SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset]; - -#elif ( SKC_TILE_RATIO == 2 ) - - SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent); - -#else - -#error("tile ratio greater than 2 not supported") - -#endif - - // - // Note there is no need to use an atomic for this operation on the - // current group of target platforms... but this may change if - // atomic ops truly go through a different path. - // - // As noted above, this direct increment is probably faster and can - // always be followed by a fence. - // - // Furthermore, note that the key sorting orders all ttck keys - // before ttpk keys. - // - - // - // FIXME -- if the SMEM store is wider than bank word count then we - // might want to odd-even interleave the TTP values if the target - // device can't handle 64-bit stores - // - - // - // skipping per-key translation for now - // - smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1); -} - -// -// Note that skc_scatter_ttsb is *not* vectorizable unless the -// architecture supports a "scatter-add" capability. All relevant -// GPUs support atomic add on shared/local memory and thus support -// scatter-add. -// - -static -void -skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, - __local union skc_subgroup_smem * SKC_RESTRICT const smem, - skc_block_id_t const sb_id) -{ - skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - - SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset]; - - // - // Skipping per-key translation for now - // - - // Index into tile - // - // The tiles are stored in column-major / height-major order - // - // The final column is a guard column that is OK to write to but - // will never be read. It simplifies the TTSB scatter but could be - // predicated if SMEM is really at a premium. - // - - SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v); - -#if 0 - if (tts_v != SKC_TTS_INVALID) - printf("(%08X) = %u\n",tts_v,xy_idx); -#endif - - // - // adjust subpixel range to max y - // - // range is stored as [-32,31] and when read [0,31] is mapped to - // [1,32] because a dy of 0 is not possible. - // - // more succinctly: if dy >= 0 then ++dy - // - SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v); - - // - // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid? - // - - // this "min(x0) * 2 + dx" is equivalent to "x0 + x1" - SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v); - - // Calculate left and right coverage contribution trapezoids - SKC_RENDER_TTS_V_BITFIELD const left = dy * widths; - SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left; - - // - // Accumulate altitudes and areas - // - // Optimization: if the device supports an CPU/SIMD vector-add or - // GPU/SIMT scatter-add atomic int2 add operation then placing the - // ALT and AREA values side-by-side would halve the number of - // additions. 
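The left/right split above always accounts for the segment's full dy * 2 * resolution area. A quick numeric check, with the subpixel resolution assumed to be 32:

#include <stdio.h>

#define SUBPIXEL_RESL_X_LOG2 5   /* assumed 32 subpixels per pixel */

int main(void)
{
  /* one segment crossing a pixel column: subpixel x endpoints, signed dy */
  int const x0 = 6, x1 = 20, dy = 9;

  int const widths = (x0 < x1 ? x0 : x1) * 2 + (x1 > x0 ? x1 - x0 : x0 - x1);
  /* "min(x)*2 + dx" is simply x0 + x1 */

  int const left  = dy * widths;
  int const right = (dy << (SUBPIXEL_RESL_X_LOG2 + 1)) - left;

  printf("left=%d right=%d sum=%d (dy * 64 = %d)\n",
         left, right, left + right, dy * 64);
  return 0;
}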
- // -#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) - // - // CPU/SIMD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C != SKC_TTS_INVALID) { \ - smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \ - smem->aN.area[ xy_idx C] += right C; \ - } - -#else - // - // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C != SKC_TTS_INVALID) { \ - SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \ - SKC_TILE_HEIGHT + xy_idx C, \ - left C); \ - SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \ - right C); \ - } -#endif - - SKC_RENDER_TTSB_EXPAND(); -} - -// -// Note that 2048.0 can be represented exactly with fp16... fortuitous! -// - -#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y) -#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA) -#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1) -#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA) - -// -// -// - -static -void -skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem, - union skc_tile_cover * SKC_RESTRICT const cover, - union skc_tile_color * SKC_RESTRICT const color) -{ - SKC_RENDER_ACC_COVER_INT area = 0; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 - for (uint ii=0; iivN.area[ii][skc_subgroup_lane()]; - SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); - SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA)); - - cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32); - } -} - -static -void -skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem, - union skc_tile_cover * SKC_RESTRICT const cover, - union skc_tile_color * SKC_RESTRICT const color) -{ - SKC_RENDER_ACC_COVER_INT area = 0; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 - for (uint ii=0; iivN.area[ii][skc_subgroup_lane()]; - SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); - SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA)); - - cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32; - } -} - -// -// -// - -static -void -skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands, - uint * SKC_RESTRICT const cmd_next, - union skc_tile_color * SKC_RESTRICT const color) -{ - // - // rgba = solid fill - // - __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; - - *cmd_next += 2; - -#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) - - SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=0; iiaN.rgba[ii].r = rg.lo; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=0; iiaN.rgba[ii].g = rg.hi; - - SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=0; iiaN.rgba[ii].b = ba.lo; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=0; iiaN.rgba[ii].a = ba.hi; - -#else - - SKC_RENDER_TILE_COLOR_PAIR const rg = 
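The two cover functions above reduce an accumulated signed area to [0,1] coverage under the nonzero and even-odd fill rules. A plain-C restatement using the same 2048 max-area constant, assuming the 32x32 subpixel grid:

#include <stdio.h>
#include <stdlib.h>

#define FILL_MAX_AREA      2048   /* 2 * 32 * 32 subpixel units == full coverage */
#define FILL_MAX_AREA_2    (2 * FILL_MAX_AREA)
#define FILL_EVEN_ODD_MASK (FILL_MAX_AREA_2 - 1)

static float cover_nonzero(int const area)
{
  int const a = abs(area);
  int const c = a < FILL_MAX_AREA ? a : FILL_MAX_AREA;   /* saturate at one winding */
  return (float)c / FILL_MAX_AREA;
}

static float cover_evenodd(int const area)
{
  int const a       = abs(area) & FILL_EVEN_ODD_MASK;    /* wrap every two windings */
  int const reflect = abs(a - FILL_MAX_AREA);            /* triangle wave           */
  return (float)(FILL_MAX_AREA - reflect) / FILL_MAX_AREA;
}

int main(void)
{
  int const samples[] = { 0, 1024, 2048, 3072, 4096, -1024 };

  for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
    printf("area %5d : nonzero %.2f  even-odd %.2f\n",
           samples[i], cover_nonzero(samples[i]), cover_evenodd(samples[i]));
  return 0;
}

Note how the even-odd mapping rises to full coverage at one winding's worth of area and falls back to zero at two, while the nonzero mapping simply saturates.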
SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); - SKC_RENDER_TILE_COLOR const r = rg.lo; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r); - - SKC_RENDER_TILE_COLOR const g = rg.hi; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g); - - SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); - SKC_RENDER_TILE_COLOR const b = ba.lo; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b); - - SKC_RENDER_TILE_COLOR const a = ba.hi; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a); - -#endif -} - -// -// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" -// -// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ -// -// Lerp in two fma/mad ops: -// -// t * b + ((-t) * a + a) -// -// Note: OpenCL documents mix() as being implemented as: -// -// a + (b - a) * t -// -// But this may be a native instruction on some devices. For example, -// on GEN9 there is an LRP "linear interoplation" function but it -// doesn't appear to support half floats. -// - -#if 1 -#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) -#else -#define SKC_LERP(a,b,t) mix(a,b,t) -#endif - -// -// CPUs have a mock local address space so copying the gradient header -// is probably not useful. Just read directly from global. -// - -#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL -#define SKC_RENDER_GRADIENT_SPACE __local -#else -#define SKC_RENDER_GRADIENT_SPACE __global -#endif - -// -// gradient is non-vertical -// -// removed the vertical (actually, horizontal) special case -// - -static -void -skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem, - __global union skc_styling_cmd const * SKC_RESTRICT const commands, - uint * SKC_RESTRICT const cmd_next, - union skc_tile_color * SKC_RESTRICT const color, - skc_ttck_hi_t const ttck_hi) -{ - // - // Where is this tile? - // - // Note that the gradient is being sampled from pixel centers. - // - SKC_RENDER_GRADIENT_FLOAT const y = -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P - (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) + - (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE)); - - float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH); - - // - // Get starting numerator and denominator - // - // Note: if gh[0].dx is exactly 0.0f then this is a vertical - // gradient and can be handled by a special opcode. - // - // Note: the mad() ordering is slightly different than the original - // CUDA implementation. - // - union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) }; - - *cmd_next += 4; - - float const gv_x_dot = mad(x,gv.dx,gv.p0); - SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot); - - // - // Where are columns along gradient vector? - // - // TODO: Note that the gv_denom isn't multiplied through. - // - // Please doublecheck this... but I recall that in certain cases - // this wipes out some precision and results in minor but noticeable - // gradient artifacts. 
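
Stripped of the vectorization, the per-column distance computed just below is (p0 + x*dx + y*dy) * denom, after which the styling command selects one of three wrap modes. The scalar sketch below restates that parameterization; the struct field names and ordering are assumptions based on what is visible in this kernel, and the wrap helpers mirror the expressions used in the switch rather than any confirmed host-side definition.

/* scalar model of the linear-gradient parameterization and its wrap modes */
#include <math.h>
#include <stdio.h>

struct grad_vec { float p0, dx, dy, denom; };  /* layout assumed, not confirmed */

/* gradient parameter t for the pixel center (px, py) */
static float grad_t(struct grad_vec const * gv, float px, float py)
{
  float const numer = fmaf(py, gv->dy, fmaf(px, gv->dx, gv->p0));
  return numer * gv->denom;
}

/* the three wrap modes dispatched on the gradient-type command word */
static float wrap_clamp  (float t) { return fminf(fmaxf(t, 0.0f), 1.0f); }
static float wrap_repeat (float t) { return t - floorf(t); }
static float wrap_reflect(float t) { float const a = fabsf(t); return fabsf(a - rintf(a)); }

int main(void)
{
  /* gradient running 100 pixels along +x: dx = 1/100, dy = 0 */
  struct grad_vec const gv = { 0.0f, 0.01f, 0.0f, 1.0f };
  float const t = grad_t(&gv, 25.5f, 7.5f);

  printf("t=%.3f clamp=%.3f repeat=%.3f reflect=%.3f\n",
         t, wrap_clamp(t), wrap_repeat(t), wrap_reflect(t));
  return 0;
}
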
- // - // All arguments are scalars except gv_numer so a simpler - // evaluation might save some flops. - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom; - - // - // is gradient non-repeating, repeating or reflecting? - // - switch (commands[(*cmd_next)++].u32) - { - case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING: - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f); - break; - - case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING: - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].distance -= floor(color->grad[ii].distance); - break; - - default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING - // - // OPTIMIZATION: Can this be done in fewer than ~4 ops? - // - // Note: OpenCL "rint()" is round-to-nearest-even integer! - // - // Note: the floor() "round to -inf" op is implemented in the - // GEN op 'FRC' so probably don't use trunc() when floor will - // suffice. - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].distance); - color->grad[ii].distance = fabs(dist_abs - rint(dist_abs)); - } - } - - // - // initialize "stoplerp" for all columns - // - uint const slope_count = commands[(*cmd_next)++].u32; - uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME - - { - float const slope = commands[(*cmd_next)++].f32; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].stoplerp = color->grad[ii].distance * slope; - } - - // - // compute stoplerp for remaining stops - // - for (int jj=1; jjgrad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp); - } - - // - // copy gradient colors to local memory - // - uint const gd_n = slope_count + 1; - -#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL - // - // copy entire gradient descriptor to local memory - // - for (uint ii=skc_subgroup_lane(); iicmds[ii].u32 = commands[*cmd_next + ii].u32; - - __local half const * const SKC_RESTRICT gc = smem->gc + 0; -#else - // - // prefetch entire gradient header - // - // no noticeable impact on performance - // - // prefetch(&commands[*cmd_next].u32,gh_words); - // - __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0; -#endif - - // - // adjust cmd_next so that V1 structure is consumed -- FIXME - // - *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n); - - // - // lerp between color pair stops - // - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].stoplerp); - SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp)); - - { - SKC_RENDER_TILE_COLOR lo, hi; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \ - lo C = cc.lo; \ - hi C = cc.hi; \ - } - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac); - } - - // - // - // - { - SKC_RENDER_TILE_COLOR lo, hi; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \ - lo C = cc.lo; \ - hi C = cc.hi; \ - } - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac); - } - - // - // - // - { - SKC_RENDER_TILE_COLOR lo, hi; - 
-#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \ - lo C = cc.lo; \ - hi C = cc.hi; \ - } - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac); - } - - // - // - // - { - SKC_RENDER_TILE_COLOR lo, hi; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \ - lo C = cc.lo; \ - hi C = cc.hi; \ - } - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac); - } - } -} - -// -// -// - -static -void -skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc, - union skc_tile_cover const * SKC_RESTRICT const cover_wip, - union skc_tile_color const * SKC_RESTRICT const color_wip) -{ - // - // fralunco = cover.wip * acc.a - // - // acc.r = fralunco * wip.r + acc.r - // acc.g = fralunco * wip.g + acc.g - // acc.b = fralunco * wip.b + acc.b - // acc.a = -fralunco * wip.a + acc.a - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] * color_acc->aN.rgba[ii].a; - - color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); - color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); - color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); - color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); - } -} - -// -// -// - -static -void -skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc, - union skc_tile_cover const * SKC_RESTRICT const cover_wip, - union skc_tile_color const * SKC_RESTRICT const color_wip) -{ - // - // cover_min = min(cover.wip,a.acc) - // - // r.acc = cover_min * r.wip + r.acc - // g.acc = cover_min * g.wip + g.acc - // b.acc = cover_min * b.wip + b.acc - // a.acc = -cover_min * a.wip + a.acc - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii],color_acc->aN.rgba[ii].a); - - color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); - color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); - color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); - color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); - } -} - -// -// -// - -static -void -skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc, - union skc_tile_cover const * SKC_RESTRICT const cover_wip, - union skc_tile_color const * SKC_RESTRICT const color_wip) -{ - // - // r.acc = (cover.wip * r.wip) * r.acc - // g.acc = (cover.wip * g.wip) * g.acc - // b.acc = (cover.wip * b.wip) * b.acc - // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha) - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r; - color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g; - color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b; - color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a; - } -} - -// -// -// - -static -void -skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc, - union skc_tile_color * SKC_RESTRICT const color_acc, - union skc_tile_cover 
const * SKC_RESTRICT const cover_wip, - union skc_tile_color const * SKC_RESTRICT const color_wip) -{ - // - // cover.wip.contrib = (1.0 - cover.acc) * cover.wip - // cover.acc = cover.acc + cover.wip.contrib - // - // r.acc = cover.wip.contrib * r.wip + r.acc - // g.acc = cover.wip.contrib * g.wip + g.acc - // b.acc = cover.wip.contrib * b.wip + b.acc - // a.acc = -cover.wip.contrib * a.wip * a.acc - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii]) * cover_wip->aN.c[ii]; - - cover_acc->aN.c[ii] += contrib; - - color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); - color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); - color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); - color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); - } -} - -// -// -// - -static -void -skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk, - union skc_tile_cover const * SKC_RESTRICT const cover_wip) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = cover_wip->aN.c[ii]; - -#else - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = cover_wip->vN.c[ii]; - -#endif -} - -// -// -// - -static -void -skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk, - union skc_tile_cover const * SKC_RESTRICT const cover_acc) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = cover_acc->aN.c[ii]; - -#else - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNTN))) - for (uint ii=0; iivN.c[ii] = cover_acc->vN.c[ii]; - -#endif -} - -// -// -// - -static -void -skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc, - union skc_tile_cover const * SKC_RESTRICT const cover_wip) -{ - // - // cover.wip.contrib = (1.0 - cover.acc) * cover.wip - // cover.acc = cover.acc + cover.wip.contrib - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]); -} - -// -// -// - -static -void -skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip, - union skc_tile_cover const * SKC_RESTRICT const cover_msk) -{ - // - // cover.wip *= cover.msk - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] *= cover_msk->aN.c[ii]; -} - -// -// -// - -static -void -skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 0; - -#else - // - // GEN9 compiler underperforms on this - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = 0; - -#endif -} - -static -void -skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 0; - -#else - // - // GEN9 compiler underperforms on this - // - - // 
__attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = 0; - -#endif -} - -static -void -skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 0; - -#else - // - // GEN9 compiler underperforms on this - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = 0; - -#endif -} - -// -// -// - -static -void -skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 1; - -#else - // - // GEN9 compiler underperforms on this - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE; - -#endif -} - -// -// -// - -static -void -skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 1 - cover->aN.c[ii]; - -#else - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = 1 - cover->vN.c[ii]; - -#endif -} - -// -// -// - -static -void -skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color) -{ -#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r = 0; - color->aN.rgba[ii].g = 0; - color->aN.rgba[ii].b = 0; - color->aN.rgba[ii].a = 1; - } - -#else - // - // DISABLED ON GEN9 -- probably a compiler bug - // - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.even = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.even = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.odd = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.odd = 1; -#endif -} - -static -void -skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color) -{ -#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r = 0; - color->aN.rgba[ii].g = 0; - color->aN.rgba[ii].b = 0; - color->aN.rgba[ii].a = 1; - } - -#else - // - // DISABLED ON GEN9 -- probably a compiler bug - // - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.even = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.even = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.odd = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.odd = 1; -#endif -} - -// -// -// - -static -bool -skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color) -{ - // - // returns true if tile is opaque - // - // various 
hacks to test for complete tile opacity - // - // note that front-to-back currently has alpha at 0.0f -- this can - // be harmonized to use a traditional alpha if we want to support - // rendering in either direction - // - // hack -- ADD/MAX/OR all alphas together and test for non-zero - // - SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=1; iiaN.rgba[ii].a; - -#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - return !any(t != ( 0 )); - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) - // - // SIMT - scalar per lane - // - return !sub_group_any(t != 0); - -#else - // - // SIMT - vector per lane - // - return !sub_group_any(any(t != ( 0 ))); - -#endif - - // - // TODO: The alternative vector-per-lane implementation below is - // *not* believed to be performant because the terse vector-wide - // test is just hiding a series of comparisons and is likely worse - // than the blind ADD/MAX/OR'ing of all alphas followed by a single - // test. - // -#if 0 - // - // SIMT - vector per lane - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1))) - for (uint ii=0; iivN.ba[ii].a != ( 0 )))) - return false; - } - - return true; -#endif -} - -// -// -// - -static -void -skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands, - uint * SKC_RESTRICT const cmd_next, - union skc_tile_color * SKC_RESTRICT const color) -{ - // - // acc.r = acc.a * r + acc.r - // acc.g = acc.a * g + acc.g - // acc.b = acc.a * b + acc.b - // - __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; - - *cmd_next += 2; - - SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g); - - SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b); -} - -// -// -// - -// #define SKC_SURFACE_IS_BUFFER -#ifdef SKC_SURFACE_IS_BUFFER - -static -void -skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface, - skc_uint const surface_pitch, - union skc_tile_color const * SKC_RESTRICT const color, - skc_ttck_hi_t const ttck_hi) -{ - // - // NEW MAJOR OPTIMIZATION: - // - // Rotating and rasterizing the original world transform by -90 - // degrees and then rendering the scene scene by +90 degrees enables - // all the final surface composite to be perfomed in perfectly - // coalesced wide transactions. - // - // For this reason, linear access to the framebuffer is preferred. - // - // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv - // - // NOTE THIS IS TRANSPOSED BY 90 DEGREES - // - // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE - // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. 
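
The opacity test and the BACKGROUND_OVER blend above both rely on the convention that color_acc.a tracks the tile's remaining transparency: it starts at 1.0, each front-to-back "over" burns it down, and once every alpha reads zero the tile can set the scatter-skip flag. The scalar model below restates that convention; layer colors are treated as premultiplied, which the over formula implies, and the code is illustrative rather than a copy of the kernel's vector paths.

/* front-to-back accumulator model: a == remaining transparency */
#include <stdio.h>

struct acc { float r, g, b, a; };

static void acc_init(struct acc * c) { c->r = c->g = c->b = 0.0f; c->a = 1.0f; }

/* "over", front to back: weight = cover * remaining transparency */
static void blend_over(struct acc * c, float cover, float r, float g, float b, float a)
{
  float const w = cover * c->a;   /* "fralunco" in the kernel above */

  c->r += w * r;
  c->g += w * g;
  c->b += w * b;
  c->a -= w * a;                  /* burn down remaining transparency */
}

/* whatever transparency is left is filled with the background color */
static void background_over(struct acc * c, float r, float g, float b)
{
  c->r += c->a * r;
  c->g += c->a * g;
  c->b += c->a * b;
}

int main(void)
{
  struct acc c;

  acc_init(&c);
  blend_over(&c, 1.0f, 0.5f, 0.0f, 0.0f, 0.5f); /* 50% red (premultiplied), full cover */
  blend_over(&c, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f); /* opaque blue underneath              */
  background_over(&c, 1.0f, 1.0f, 1.0f);        /* white background: no-op, a == 0     */

  printf("rgb = %.2f %.2f %.2f  remaining transparency = %.2f\n", c.r, c.g, c.b, c.a);
  return 0;
}
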
- // - // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS - // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS - // - // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL - // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER - // - uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE; - uint const x = skc_ttck_hi_get_x(ttck_hi); - uint const y = skc_ttck_hi_get_y(ttck_hi) ; - uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane(); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r * 255); - rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8; - rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16; - - surface[base + ii * pitch] = rgba; - - // printf("%08v2X\n",rgba); - } -} - -#else - -static -void -skc_surface_composite_u8_rgba(__write_only image2d_t surface, - union skc_tile_color const * SKC_RESTRICT const color, - skc_ttck_hi_t const ttck_hi) -{ - // - // NEW MAJOR OPTIMIZATION: - // - // Rotating and rasterizing the original world transform by -90 - // degrees and then rendering the scene scene by +90 degrees enables - // all the final surface composite to be perfomed in perfectly - // coalesced wide transactions. - // - // For this reason, linear access to the framebuffer is preferred. - // - // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv - // - // NOTE THIS IS TRANSPOSED BY 90 DEGREES - // - // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE - // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. - // - // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS - // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS - // - // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL - // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER - // - -#if 1 - int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; - int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiiN.rgba[ii] A); \ - } - -#else - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_SURFACE_COLOR const rgba = \ - (SKC_RENDER_SURFACE_COLOR) \ - (color->aN.rgba[ii].r C, \ - color->aN.rgba[ii].g C, \ - color->aN.rgba[ii].b C, \ - 1.0); \ - SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \ - } - -#endif - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - x += 1; - } -#else - int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); - int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiiN.rgba[ii] A); \ - } - -#else - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_SURFACE_COLOR const rgba = \ - (SKC_RENDER_SURFACE_COLOR) \ - (color->aN.rgba[ii].r C, \ - color->aN.rgba[ii].g C, \ - color->aN.rgba[ii].b C, \ - 1.0); \ - SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \ - } - -#endif - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - } - -#endif -} - -#endif - -// -// -// -static -uint const -skc_ttck_lane(uint const ttck_idx) -{ - return ttck_idx & SKC_RENDER_SUBGROUP_MASK; -} - -// -// RENDER KERNEL -// - -__kernel -SKC_RENDER_KERNEL_ATTRIBS -void -skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers, - __global struct 
skc_group_node const * SKC_RESTRICT const groups, - __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename - - __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys - skc_uint const ttck_count, // rename: key_count - - __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets - skc_uint const tile_count, // rename: offset_count - - __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, -#ifdef SKC_SURFACE_IS_BUFFER - __global void * SKC_RESTRICT const surface, -#else - __write_only image2d_t surface, -#endif -#ifdef SKC_SURFACE_IS_BUFFER - skc_uint const surface_pitch, -#endif - uint4 const tile_clip) // rename: clip -{ - // - // Each subgroup is responsible for a tile. No extra subgroups are - // launched. - // - // FIXME -- might be better implemented as a "grid stride loop" if - // Intel GEN really has a local memory "quantum" of 4KB which means - // we would need to launch 4 subgroups per workgroup. - // - // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB. - // - - // - // declare tile cover and color registers - // - // this used to be a neat unified struct but the Intel GEN compiler - // wasn't cooperating and spilling to private memory even though all - // registers were indexed by constants - // - union skc_tile_color color_wip; - union skc_tile_color color_acc; - - union skc_tile_cover cover_wip; - union skc_tile_cover cover_acc; - union skc_tile_cover cover_msk; - - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0) - // as a uniform but the alternative calculation used when there are - // multiple subgroups per workgroup is not cooperating and - // driving spillage elsewhere. - // -#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const ttck_offset_idx = get_group_id(0); -#else - skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // load the starting ttck for this offset and get a bound on the max - // number of keys that might be loaded - // - // these are uniform across all subgroup lanes - // - skc_uint ttck_idx = ttck_offsets[ttck_offset_idx]; - - // - // FIXME -- SIMD/CPU version should probaby load a 256-bit (4-wide) - // vector of ttck keys - // -#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - - skc_ttck_t ttck = ttck_keys[ttck_idx]; - -#else - - uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK; - uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK; - skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)] - -#endif - - // - // set up style group/layer state - // - struct skc_styling_group { - union skc_group_range range; - skc_uint depth; - skc_uint id; - } group; - - group.range.lo = 0; - group.range.hi = SKC_UINT_MAX; - group.depth = 0; - group.id = SKC_UINT_MAX; - - // - // start with clear tile opacity, knockout and flag bits - // - // uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 - // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 - // - skc_uint flags = 0; - - // - // declare and initialize accumulators - // -#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) - __local union skc_subgroup_smem smem[1]; -#else - __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS]; - __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id(); -#endif - -#ifdef 
SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - // - // select the initial ttck key - // - skc_ttck_t ttck; -#if 0 - ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN -#else - ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND - ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane); -#endif - -#endif - - // - // save the first key so we know what tile we're in - // - skc_ttck_t ttck0 = ttck; - - // - // evaluate the coarse clip as late as possible - // - skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi); - - if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x)) - return; - - skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi); - - if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y)) - return; - -#if 0 - printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y); -#endif - - // - // load -> scatter -> flush - // - while (true) - { - // if scattering is disabled then just run through ttck keys - bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0; - - // need to clear accumulators before a scatter loop - if (is_scatter_enabled) - { - skc_tile_aa_zero(smem); - } - - do { - // skip scattering? - if (is_scatter_enabled) - { - skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo); - - if (skc_ttck_lo_is_prefix(ttck.lo)) { - skc_scatter_ttpb(ttxb_extent,smem,xb_id); - } else { - skc_scatter_ttsb(ttxb_extent,smem,xb_id); - } - } - - // - // any ttck keys left? - // - if (++ttck_idx >= ttck_count) - { - flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; - break; - } - - // - // process next ttck key - // -#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - // - // SIMD -- read next key - // - ttck = ttck_keys[ttck_idx]; -#else - // - // SIMT -- refresh the ttck_s? - // - uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK; - - if (ttck_lane_next == 0) - ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)]; - - // - // broadcast next key to entire subgroup - // -#if 0 - ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN -#else - ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND - ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next); -#endif -#endif - // continue scattering if on same YXL layer - } while (skc_ttck_equal_yxl(ttck0,ttck)); - - // finalize if no longer on same YX tile - if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi)) - { - // otherwise, unwind the tile styling and exit - flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; - } - - // - // given: new layer id from ttxk key - // - // load [layer id]{ group id, depth } - // - // if within current group's layer range - // - // if at same depth - // - // load and execute cover>[mask>]color>blend commands - // - // else if not at same depth then move deeper - // - // for all groups in group trail from cur depth to new depth - // enter group, saving and initializing regs as necessary - // increment depth and update layer range - // load and execute cover>[mask>]color>blend commands - // - // else not within layer range - // - // exit current group, restoring regs as necessary - // decrement depth and update layer range - // - // - skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi - union skc_layer_node const layer_node_new = layers[layer_id_new]; - - // clear flag that controls group/layer traversal - flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE; - - do { - bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0; - - // - // is 
layer a child of the current parent group? - // - uint cmd_next = 0; - - if (!unwind && (layer_node_new.parent == group.id)) - { - // execute this layer's cmds - cmd_next = layer_node_new.cmds; - - // if this is final then configure so groups get unwound, otherwise we're done - flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE); - } - else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi)) - { - // - // is layer in a child group? - // - union skc_group_parents const gp = groups[layer_node_new.parent].parents; - uint const gn = gp.depth - ++group.depth; - - if (gn == 0) - group.id = layer_node_new.parent; - else - group.id = commands[gp.base + gn - 1].parent; - - // update group layer range - group.range = groups[group.id].range; - - // enter current group - cmd_next = groups[group.id].cmds.enter; - } - else // otherwise, exit this group - { - // enter current group - cmd_next = groups[group.id].cmds.leave; - - // decrement group depth - if (--group.depth == 0) - { - flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE; - } - else - { - // get path_base of current group - uint const gnpb = groups[group.id].parents.base; - - // get parent of current group - group.id = commands[gnpb].parent; - - // update group layer range - group.range = groups[group.id].range; - } - } - - // - // execute cmds - // - while (true) - { - union skc_styling_cmd const cmd = commands[cmd_next++]; - - switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE) - { - case SKC_STYLING_OPCODE_NOOP: - break; - - case SKC_STYLING_OPCODE_COVER_NONZERO: - skc_tile_cover_nonzero(smem,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_EVENODD: - skc_tile_cover_evenodd(smem,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACCUMULATE: - skc_tile_cover_accumulate(&cover_acc,&cover_wip); - break; - - case SKC_STYLING_OPCODE_COVER_MASK: - skc_tile_cover_wip_mask(&cover_wip,&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_WIP_ZERO: - skc_tile_cover_wip_zero(&cover_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACC_ZERO: - skc_tile_cover_acc_zero(&cover_acc); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_ZERO: - skc_tile_cover_msk_zero(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_ONE: - skc_tile_cover_msk_one(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_INVERT: - skc_tile_cover_msk_invert(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COLOR_FILL_SOLID: - skc_tile_color_fill_solid(commands,&cmd_next,&color_wip); - break; - - case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR: - // - // FIXME -- gradients shouldn't be executing so much - // conditional driven code at runtime since we *know* - // the gradient style on the host can just create a - // new styling command to exploit this. - // - // FIXME -- it might be time to try using the GPU's - // sampler on a linear array of half4 vectors -- it - // might outperform the explicit load/lerp routines. - // - // FIXME -- optimizing for vertical gradients (uhhh, - // they're actually horizontal due to the -90 degree - // view transform) is nice but is it worthwhile to - // have this in the kernel? Easy to add it back... 
- // -#if defined( SKC_ARCH_GEN9 ) - // disable gradients due to exessive spillage -- fix later - cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32); -#else - skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi); -#endif - break; - - case SKC_STYLING_OPCODE_COLOR_WIP_ZERO: - skc_tile_color_wip_zero(&color_wip); - break; - - case SKC_STYLING_OPCODE_COLOR_ACC_ZERO: - skc_tile_color_acc_zero(&color_acc); - break; - - case SKC_STYLING_OPCODE_BLEND_OVER: - skc_tile_blend_over(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_PLUS: - skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_MULTIPLY: - skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_KNOCKOUT: - skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK: - // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK: - // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc); - break; - - case SKC_STYLING_OPCODE_BACKGROUND_OVER: - skc_tile_background_over(commands,&cmd_next,&color_acc); - break; - - case SKC_STYLING_OPCODE_SURFACE_COMPOSITE: -#ifdef SKC_SURFACE_IS_BUFFER - skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi); -#else - skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi); -#endif - break; - - case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY: - if (skc_tile_color_test_opacity(&color_acc)) - flags |= SKC_TILE_FLAGS_SCATTER_SKIP; - break; - - default: - return; // this is an illegal opcode -- trap and die! - } - - // - // if sign bit is set then this was final command - // - if (cmd.s32 < 0) - break; - } - - // continue as long as tile flush isn't complete - } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0); - - // return if was the final flush - if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) - return; - - // update wip ttck_hi - ttck0 = ttck; - } -} - -// -// -// diff --git a/src/compute/skc/runtime_cl.c b/src/compute/skc/runtime_cl.c deleted file mode 100644 index a745ed013e..0000000000 --- a/src/compute/skc/runtime_cl.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include -#include -#include -#include - -// -// -// - -#include "runtime_cl.h" -#include "common/cl/assert_cl.h" - -// -// -// - -static is_verbose = true; - -// -// FIXME -- all variable length device queries need to start querying -// the parameter's return size before getting its value -// -// FIXME -- this is now handled by the common/cl/find.* routine -// - -union skc_cl_device_version { - struct { - cl_uchar opencl_space[7]; // "OpenCL_" - cl_uchar major; - cl_uchar dot; - cl_uchar minor; -#if 1 // Intel NEO requires at least 16 bytes - cl_uchar space; - cl_uchar vendor[32]; -#endif - }; - struct { - cl_uchar aN[]; - }; -}; - -typedef cl_bitfield cl_diagnostic_verbose_level_intel; - -#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4 - -static -void -CL_CALLBACK -skc_context_callback(char const * error, void const * info, size_t size, void * user) -{ - if (info != NULL ) - { - fprintf(stderr,"%s\n",error); - } -} - -// -// -// - -skc_err -skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]) -{ - skc_err err = SKC_ERR_SUCCESS; - - // - // search available devices for a match - // -#define PLATFORM_IDS_MAX 16 -#define DEVICE_IDS_MAX 16 -#define PLATFORM_NAME_SIZE_MAX 64 -#define DEVICE_NAME_SIZE_MAX 64 -#define DRIVER_VERSION_SIZE_MAX 64 - - cl_int cl_err; - - cl_platform_id platform_ids[PLATFORM_IDS_MAX]; - cl_device_id device_ids [PLATFORM_IDS_MAX][DEVICE_IDS_MAX]; - - cl_uint platform_count; - cl_uint device_count[PLATFORM_IDS_MAX]; - - cl_uint platform_idx = UINT32_MAX, device_idx = UINT32_MAX; - - bool match = false; // find _first_ match - - // - // get number of platforms - // - cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count)); - - // - // search platforms - // - for (cl_uint ii=0; iiversion.major = device_version.major - 48; - runtime_cl->version.minor = device_version.minor - 48; - runtime_cl->base_align = base_align; - - if (is_verbose) { - fprintf(stdout," >>>"); - } - } - else if (is_verbose) - { - fprintf(stdout," "); - } - - if (is_verbose) { - fprintf(stdout, - " %1u: %s [ %s ] [ %s ] [ %u ]\n", - jj, - device_name, - device_version.aN, - driver_version, - base_align); - } - } - } - - if (is_verbose) { - fprintf(stdout,"\n"); - } - - // - // get target platform and device - // - if (platform_idx >= platform_count) - { - fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring); - exit(EXIT_FAILURE); - } - if (device_idx >= device_count[platform_idx]) - { - fprintf(stderr,"no match for target device substring %s\n",target_device_substring); - exit(EXIT_FAILURE); - } - - runtime_cl->platform_id = platform_ids[platform_idx]; - runtime_cl->device_id = device_ids [platform_idx][device_idx]; - - // - // create context - // - -#if 0 - cl_context_properties context_properties[] = - { - CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id, - 0 - }; -#else - context_properties[1] = (cl_context_properties)runtime_cl->platform_id; -#endif - - runtime_cl->context = clCreateContext(context_properties, - 1, - &runtime_cl->device_id, - skc_context_callback, - NULL, - &cl_err); - cl_ok(cl_err); - - // - // get device name, driver version, and unified memory flag - // - if (is_verbose) - { - char 
device_name[DEVICE_NAME_SIZE_MAX]; - char driver_version[DRIVER_VERSION_SIZE_MAX]; - cl_bool device_is_unified; - cl_device_svm_capabilities svm_caps; - size_t printf_buffer_size; - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_NAME, - sizeof(device_name), - device_name, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DRIVER_VERSION, - sizeof(driver_version), - driver_version, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_HOST_UNIFIED_MEMORY, - sizeof(device_is_unified), - &device_is_unified, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_SVM_CAPABILITIES, - sizeof(svm_caps), - &svm_caps, - 0)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_PRINTF_BUFFER_SIZE, - sizeof(printf_buffer_size), - &printf_buffer_size, - NULL)); - - fprintf(stderr, - "CL_DEVICE_SVM_COARSE_GRAIN_BUFFER %c\n" - "CL_DEVICE_SVM_FINE_GRAIN_BUFFER %c\n" - "CL_DEVICE_SVM_FINE_GRAIN_SYSTEM %c\n" - "CL_DEVICE_SVM_ATOMICS %c\n" - "CL_DEVICE_PRINTF_BUFFER_SIZE %zu\n\n", - svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-', - svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? '*' : '-', - svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? '*' : '-', - svm_caps & CL_DEVICE_SVM_ATOMICS ? '*' : '-', - printf_buffer_size); - } - - return err; -} - -// -// -// - -skc_err -skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl) -{ - // FIXME - printf("%s incomplete!\n",__func__); - - return SKC_ERR_SUCCESS; -} - -// -// -// - -cl_command_queue -skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type) -{ - cl_command_queue cq; - - if (runtime_cl->version.major < 2) - { - // - // <= OpenCL 1.2 - // - cl_int cl_err; - - cq = clCreateCommandQueue(runtime_cl->context, - runtime_cl->device_id, - (cl_command_queue_properties)type, - &cl_err); cl_ok(cl_err); - } - else - { - // - // >= OpenCL 2.0 - // - cl_int cl_err; - cl_queue_properties const queue_properties[] = { - CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0 - }; - - cq = clCreateCommandQueueWithProperties(runtime_cl->context, - runtime_cl->device_id, - queue_properties, - &cl_err); cl_ok(cl_err); - } - - return cq; -} - -// -// -// - diff --git a/src/compute/skc/runtime_cl.h b/src/compute/skc/runtime_cl.h deleted file mode 100644 index 9e58ca0cc7..0000000000 --- a/src/compute/skc/runtime_cl.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// squelch OpenCL 1.2 deprecation warning -// - -#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#endif - -#include - -// -// -// - -#include "skc.h" - -// -// Minimal OpenCL state needed by the runtime to get started -// - -struct skc_runtime_cl -{ - cl_platform_id platform_id; - cl_device_id device_id; - cl_context context; - - struct { - cl_uint major; - cl_uint minor; - } version; // sometimes we need to know this at runtime - - cl_uint base_align; // base address alignment for subbuffer origins -}; - -// -// -// - -typedef enum skc_cq_type_e { - SKC_CQ_TYPE_IN_ORDER = 0, - SKC_CQ_TYPE_OUT_OF_ORDER = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, - SKC_CQ_TYPE_IN_ORDER_PROFILING = (SKC_CQ_TYPE_IN_ORDER | CL_QUEUE_PROFILING_ENABLE), - SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE), -} skc_cq_type_e; - -// -// safely creates a generic OpenCL target in very few lines -// - -skc_err -skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]); - -skc_err -skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl); - -// -// create a command queue with the non-deprecated function -// - -cl_command_queue -skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type); - -// -// -// - diff --git a/src/compute/skc/runtime_cl_12.c b/src/compute/skc/runtime_cl_12.c deleted file mode 100644 index fca13edbbd..0000000000 --- a/src/compute/skc/runtime_cl_12.c +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include -#include -#include -#include - -// -// -// - -#include "context.h" -#include "block.h" -#include "grid.h" -#include "common/cl/assert_cl.h" -#include "config_cl.h" -#include "runtime_cl.h" -#include "runtime_cl_12.h" -#include "export_cl_12.h" - -// -// -// - -static -void -skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq) -{ - // save size - runtime->block_pool.size = &runtime->config->block_pool; - - // create block extent - skc_extent_pdrw_alloc(runtime, - &runtime->block_pool.blocks, - runtime->block_pool.size->pool_size * - runtime->config->block.bytes); - - // allocate block pool ids - skc_extent_pdrw_alloc(runtime, - &runtime->block_pool.ids, - runtime->block_pool.size->ring_pow2 * sizeof(skc_uint)); - - // allocate block pool atomics - skc_extent_phr_pdrw_alloc(runtime, - &runtime->block_pool.atomics, - sizeof(union skc_block_pool_atomic)); - - // acquire pool id and atomic initialization kernels - cl_kernel k0 = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS); - cl_kernel k1 = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS); - - // init ids - cl(SetKernelArg(k0,0,sizeof(runtime->block_pool.ids.drw),&runtime->block_pool.ids.drw)); - cl(SetKernelArg(k0,1,SKC_CL_ARG(runtime->block_pool.size->pool_size))); - - // the kernel grid is shaped by the target device -- always 2 for atomics - skc_device_enqueue_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS, - cq,k0,runtime->block_pool.size->pool_size, - 0,NULL,NULL); - - // init atomics - cl(SetKernelArg(k1,0,sizeof(runtime->block_pool.atomics.drw),&runtime->block_pool.atomics.drw)); - cl(SetKernelArg(k1,1,SKC_CL_ARG(runtime->block_pool.size->pool_size))); - - // the kernel grid is shaped by the target device - skc_device_enqueue_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS, - cq,k1,2, - 0,NULL,NULL); - - // kickstart kernel execution - cl(Flush(cq)); - - // release kernels - cl(ReleaseKernel(k0)); - cl(ReleaseKernel(k1)); -} - -static -void -skc_block_pool_dispose(struct skc_runtime * const runtime) -{ - skc_extent_phr_pdrw_free(runtime,&runtime->block_pool.atomics); - skc_extent_pdrw_free (runtime,&runtime->block_pool.ids); - skc_extent_pdrw_free (runtime,&runtime->block_pool.blocks); -} - -// -// -// - -static -bool -skc_runtime_yield(struct skc_runtime * const runtime) -{ - return skc_scheduler_yield(runtime->scheduler); -} - -static -void -skc_runtime_wait(struct skc_runtime * const runtime) -{ - skc_scheduler_wait(runtime->scheduler); -} - -// -// -// - -skc_err -skc_runtime_cl_12_create(struct skc_context * const context, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]) -{ - // allocate the runtime - struct skc_runtime * const runtime = malloc(sizeof(*runtime)); - - // acquire OpenCL ids and context for target device - skc_err err = skc_runtime_cl_create(&runtime->cl, - target_platform_substring, - target_device_substring, - context_properties); - - // create device - skc_device_create(runtime); - - // create the host and device allocators - skc_allocator_host_create(runtime); - skc_allocator_device_create(runtime); - - // how many slots in the scheduler? 
- runtime->scheduler = skc_scheduler_create(runtime,runtime->config->scheduler.size); - - // allocate deps structure - runtime->deps = skc_grid_deps_create(runtime, - runtime->scheduler, - runtime->config->block_pool.pool_size); - - // initialize cq pool - skc_cq_pool_create(runtime, - &runtime->cq_pool, - runtime->config->cq_pool.type, - runtime->config->cq_pool.size); - - // acquire in-order cq - cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); - - // initialize block pool - skc_block_pool_create(runtime,cq); - - // intialize handle pool - skc_handle_pool_create(runtime, - &runtime->handle_pool, - runtime->config->handle_pool.size, - runtime->config->handle_pool.width, - runtime->config->handle_pool.recs); - - // - // initialize pfns - // - // FIXME -- at this point we will have identified which device we've - // targeted and will load a DLL (or select from a built-in library) - // that contains all the pfns. - // - context->runtime = runtime; - - context->yield = skc_runtime_yield; - context->wait = skc_runtime_wait; - - context->path_builder = skc_path_builder_cl_12_create; - context->path_retain = skc_runtime_path_host_retain; - context->path_release = skc_runtime_path_host_release; - context->path_flush = skc_runtime_path_host_flush; - - context->raster_builder = skc_raster_builder_cl_12_create; - context->raster_retain = skc_runtime_raster_host_retain; - context->raster_release = skc_runtime_raster_host_release; - context->raster_flush = skc_runtime_raster_host_flush; - - context->composition = skc_composition_cl_12_create; - context->styling = skc_styling_cl_12_create; - - context->surface = skc_surface_cl_12_create; - - // block on pool creation - cl(Finish(cq)); - - // dispose of in-order cq - skc_runtime_release_cq_in_order(runtime,cq); - - return err; -}; - -// -// -// - -skc_err -skc_runtime_cl_12_dispose(struct skc_context * const context) -{ - // - // FIXME -- incomplete - // - fprintf(stderr,"%s incomplete!\n",__func__); - - struct skc_runtime * runtime = context->runtime; - - skc_allocator_device_dispose(runtime); - skc_allocator_host_dispose(runtime); - - skc_scheduler_dispose(context->runtime,context->runtime->scheduler); - - skc_grid_deps_dispose(context->runtime->deps); - - skc_cq_pool_dispose(runtime,&runtime->cq_pool); - - skc_block_pool_dispose(context->runtime); - - // skc_handle_pool_dispose(context->runtime); - - return SKC_ERR_SUCCESS; -} - -// -// TEMPORARY BENCHMARK -// - -#if 1 - -#include - -#define SKC_FRAMES_MASK 0x7F -#define SKC_FRAMES (SKC_FRAMES_MASK + 1) - -void -skc_runtime_cl_12_debug(struct skc_context * const context) -{ -#ifdef NDEBUG - static skc_uint frames=0; - static LARGE_INTEGER StartingTime={0}, EndingTime; - - if ((frames++ & SKC_FRAMES_MASK) != SKC_FRAMES_MASK) - return; - - QueryPerformanceCounter(&EndingTime); - - LARGE_INTEGER ElapsedMicroseconds, Frequency; - - ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart; - - QueryPerformanceFrequency(&Frequency); - - double const msecs_total = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart; - double const msecs_frame = msecs_total / SKC_FRAMES; - - printf("Frames / Total / Per : %u / %.3f / %.3f\n", - SKC_FRAMES,msecs_total,msecs_frame); -#endif - - struct skc_runtime * const runtime = context->runtime; - - // acquire out-of-order cq - cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); - - // copy atomics to host - skc_extent_phr_pdrw_read(&runtime->block_pool.atomics,cq,NULL); - - // block until complete - cl(Finish(cq)); - - // 
dispose of out-of-order cq - skc_runtime_release_cq_in_order(runtime,cq); - - union skc_block_pool_atomic const * const bp_atomic = runtime->block_pool.atomics.hr; - - skc_uint const available = bp_atomic->writes - bp_atomic->reads; - skc_uint const inuse = runtime->config->block_pool.pool_size - available; - - fprintf(stderr,"w/r/f/a: %9u - %9u = %9u : %6.2f MB\n", - bp_atomic->writes, - bp_atomic->reads, - available, - (inuse * runtime->config->block.bytes) / (1024.0*1024.0)); - - if (available >= (1<<27)) - { - fprintf(stderr,"block pool corrupted!\n"); - exit(-1); - } - - // - // - // -#ifdef NDEBUG - QueryPerformanceCounter(&StartingTime); -#endif -} - -#endif - -// -// -// - diff --git a/src/compute/skc/runtime_cl_12.h b/src/compute/skc/runtime_cl_12.h deleted file mode 100644 index 7e7ffcb284..0000000000 --- a/src/compute/skc/runtime_cl_12.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include "runtime.h" -#include "runtime_cl.h" -#include "cq_pool_cl.h" -#include "handle_pool_cl_12.h" -#include "block_pool_cl_12.h" -#include "allocator_device_cl.h" - -// -// FIXME -- two parts: -// -// 1. directly access the structures in the runtime sub-struct implementations -// 2. possibly wall off the non-platform-specific structs into a sub structure -// - -struct skc_runtime -{ - // - // state visible to device - // - struct skc_runtime_cl cl; - - struct { - struct skc_allocator_host host; - struct skc_allocator_device device; - } allocator; - - struct skc_cq_pool cq_pool; - - struct skc_block_pool block_pool; - - struct skc_handle_pool handle_pool; - - // - // state that is slightly opaque (for now) - // - struct skc_scheduler * scheduler; - - struct skc_grid_deps * deps; - - struct skc_config const * config; // FIXME: config will be determined by device with some opportunities to resize - - struct skc_device * device; // opaque bundle of kernels -}; - -// -// Creation and disposal intitializes context and may rely on other -// context resources like the scheduler -// - -skc_err -skc_runtime_cl_12_create(struct skc_context * const context, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]); - -skc_err -skc_runtime_cl_12_dispose(struct skc_context * const context); - -// -// HOST HANDLE RETAIN/RELEASE/FLUSH -// - -skc_err -skc_runtime_path_host_retain(struct skc_runtime * const runtime, - skc_path_t const * paths, - uint32_t count); - -skc_err -skc_runtime_raster_host_retain(struct skc_runtime * const runtime, - skc_raster_t const * rasters, - uint32_t count); - - -skc_err -skc_runtime_path_host_release(struct skc_runtime * const runtime, - skc_path_t const * paths, - uint32_t count); - -skc_err -skc_runtime_raster_host_release(struct skc_runtime * const runtime, - skc_raster_t const * rasters, - uint32_t count); - - -skc_err -skc_runtime_path_host_flush(struct skc_runtime * const runtime, - skc_path_t const * paths, - uint32_t count); - -skc_err -skc_runtime_raster_host_flush(struct skc_runtime * const runtime, - skc_raster_t const * rasters, - uint32_t count); - -// -// DEVICE/PIPELINE HANDLE ACQUIRE/RETAIN/RELEASE -// -// The retain operations pre-validate handles -// - -skc_handle_t -skc_runtime_handle_device_acquire(struct skc_runtime * const runtime); - -skc_err -skc_runtime_handle_device_validate_retain(struct skc_runtime * 
const runtime, - skc_typed_handle_type_e const handle_type, - skc_typed_handle_t const * typed_handles, - uint32_t count); - -void -skc_runtime_handle_device_retain(struct skc_runtime * const runtime, - skc_handle_t const * handles, - uint32_t count); - -void -skc_runtime_path_device_release(struct skc_runtime * const runtime, - skc_handle_t const * handles, - uint32_t count); - -void -skc_runtime_raster_device_release(struct skc_runtime * const runtime, - skc_handle_t const * handles, - uint32_t count); - -// -// We only use in-order command queues in the pipeline -// - -cl_command_queue -skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime); - -void -skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, - cl_command_queue cq); - -// -// DEVICE MEMORY ALLOCATION -// - -cl_mem -skc_runtime_device_perm_alloc(struct skc_runtime * const runtime, - cl_mem_flags const flags, - size_t const size); - -void -skc_runtime_device_perm_free(struct skc_runtime * const runtime, - cl_mem const mem); - -cl_mem -skc_runtime_device_temp_alloc(struct skc_runtime * const runtime, - cl_mem_flags const flags, - size_t const size, - skc_subbuf_id_t * const subbuf_id, - size_t * const subbuf_size); - -void -skc_runtime_device_temp_free(struct skc_runtime * const runtime, - cl_mem const mem, - skc_subbuf_id_t const subbuf_id); - -// -// -// diff --git a/src/compute/skc/segment_ttck.cl b/src/compute/skc/segment_ttck.cl deleted file mode 100644 index 6ac068bee6..0000000000 --- a/src/compute/skc/segment_ttck.cl +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE -// LAYOUT OF THE TTCK KEY. 
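
The kernel that follows walks each sorted slab of TTCK keys, flags the positions where the tile's y/x field changes, and compacts those positions into an index array with a subgroup scan plus a single atomic reservation. The standalone sketch below restates that segmentation step in scalar form; the mask value is a placeholder for the real TTCK layout constant, and the padding-key (SKC_ULONG_MAX) handling is omitted.

/* scalar model of TTCK segmentation: one index per run of keys with equal y/x */
#include <stdint.h>
#include <stdio.h>

#define TTCK_HI_MASK_YX 0xFFFFF000u    /* placeholder for SKC_TTCK_HI_MASK_YX */

static uint32_t hi32(uint64_t k) { return (uint32_t)(k >> 32); }

/* writes the start index of every y/x run into indices[], returns the count */
static uint32_t segment_ttck(uint64_t const * keys, uint32_t n, uint32_t * indices)
{
  uint32_t count = 0;

  for (uint32_t ii = 0; ii < n; ii++)
    {
      int const is_first = (ii == 0);
      int const yx_diff  = !is_first &&
        (((hi32(keys[ii]) ^ hi32(keys[ii - 1])) & TTCK_HI_MASK_YX) != 0);

      if (is_first || yx_diff)
        indices[count++] = ii;         /* the kernel reserves these slots atomically */
    }

  return count;
}

int main(void)
{
  uint64_t const keys[] = {            /* fake sorted keys, tile id in the high bits */
    0x0000100000000001ull, 0x0000100000000002ull,
    0x0000200000000001ull, 0x0000300000000007ull,
  };
  uint32_t idx[4];
  uint32_t const starts = segment_ttck(keys, 4, idx);

  for (uint32_t ii = 0; ii < starts; ii++)
    printf("run %u starts at key %u\n", ii, idx[ii]);

  return 0;
}
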
IF THE TTCK KEY IS ALTERED THEN THIS -// KERNEL WILL NEED TO BE UPDATED -// - -#include - -#include "atomic_cl.h" -#include "tile.h" - -// -// -// - -#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) -#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) - -// -// -// - -#define SKC_YX_NEQ(row,prev) \ - (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0) - -// -// -// - -__kernel -__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) -void -skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, - __global uint * SKC_RESTRICT const indices, - __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) -{ - uint const global_id = get_global_id(0); - uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; - uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); - uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; - - // - // LOAD ALL THE ROWS - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; - - HS_SLAB_ROWS(); - - // - // LOAD LAST REGISTER FROM COLUMN TO LEFT - // - uint diffs = 0; - uint2 r0 = r1; - - if (gmem_base > 0) { - // if this is the first key in any slab but the first then it - // broadcast loads the last key in previous slab - r0.hi = as_uint2(vout[gmem_base - 1]).hi; - } else if (get_sub_group_local_id() == 0) { - // if this is the first lane in the first slab - diffs = 1; - } - - // now shuffle in the last key from the column to the left - r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); - - // - // FIND ALL DIFFERENCES IN SLAB - // - uint valid = 0; - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - valid |= ((r##row != SKC_ULONG_MAX) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - diffs |= (SKC_YX_NEQ(row,prev) << prev); - - HS_SLAB_ROWS(); - - // - // SUM UP THE DIFFERENCES - // - uint const valid_diffs = valid & diffs; - uint const count = popcount(valid_diffs); - uint const inclusive = sub_group_scan_inclusive_add(count); - uint const exclusive = inclusive - count; - - // - // RESERVE SPACE IN THE INDICES ARRAY - // - uint next = 0; - - if (get_sub_group_local_id() == HS_LANES_PER_WARP-1) - next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset - - // distribute base across subgroup - next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1); - - // - // STORE THE INDICES - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (valid_diffs & (1 << prev)) \ - indices[next++] = lane_idx + prev; - - HS_SLAB_ROWS(); - - // - // TRANSPOSE THE SLAB AND STORE IT - // - HS_TRANSPOSE_SLAB(); -} - -// -// -// diff --git a/src/compute/skc/segment_ttrk.cl b/src/compute/skc/segment_ttrk.cl deleted file mode 100644 index 28a9557ad7..0000000000 --- a/src/compute/skc/segment_ttrk.cl +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE -// LAYOUT OF THE TTRK KEY. 
IF THE TTRK KEY IS ALTERED THEN THIS -// KERNEL WILL NEED TO BE UPDATED -// - -#include - -#include "tile.h" -#include "raster_builder_cl_12.h" // need meta_in structure -#include "device_cl_12_gen9.h" - -// -// -// - -#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) -#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) - -// -// THE BEST TYPE TO ZERO SMEM -// - -#define SKC_ZERO_TYPE ulong -#define SKC_ZERO_WORDS 2 - -// -// THE ORDER OF COMPONENTS IS: -// -// 0: blocks -// 1: offset -// 2: pk -// 3: rk -// - -#if (HS_KEYS_PER_SLAB < 256) - -#define SKC_META_TYPE uint -#define SKC_META_WORDS 1 - -#define SKC_COMPONENT_TYPE uchar - -#else - -#define SKC_META_TYPE uint2 -#define SKC_META_WORDS 2 - -#define SKC_COMPONENT_TYPE ushort - -#endif - -// -// -// - -#if ( SKC_TTRK_HI_BITS_COHORT <= 8) -#define SKC_COHORT_TYPE uchar -#else -#define SKC_COHORT_TYPE ushort -#endif - -// -// -// - -#define SKC_COHORT_ID(row) \ - as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT - -// -// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED -// - -#define SKC_IS_BLOCK(row) \ - ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - -#define SKC_YX(row,prev) \ - (as_uint2(r##row).hi ^ as_uint2(r##prev).hi) - -#define SKC_IS_PK(row,prev) \ - ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X) - -// -// COHORT SIZE IS ALWAYS A POWER-OF-TWO -// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO -// -// COHORT SIZE >= SUBGROUP SIZE -// - -#define SKC_COHORT_SIZE (1<> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; - uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); - uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; - - // - // LOAD ALL THE ROWS - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; - - HS_SLAB_ROWS(); - - // - // LOAD LAST REGISTER FROM COLUMN TO LEFT - // - uint diffs = 0; - uint2 r0 = 0; - - if (gmem_base > 0) { - // if this is the first key in any slab but the first then it - // broadcast loads the last key in previous slab - r0.hi = as_uint2(vout[gmem_base - 1]).hi; - } else { - // otherwise broadcast the first key in the first slab - r0.hi = sub_group_broadcast(as_uint2(r1).hi,0); - // and mark it as an implicit diff - if (get_sub_group_local_id() == 0) - diffs = 1; - } - - // now shuffle in the last key from the column to the left - r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); - - // shift away y/x - SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT; - - // - // EXTRACT ALL COHORT IDS EARLY... - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row); - - HS_SLAB_ROWS(); - - // - // DEBUG - // -#if 0 - if (gmem_base == HS_KEYS_PER_SLAB * 7) - { - if (get_sub_group_local_id() == 0) - printf("\n%llX ",as_ulong(r0)); - else - printf("%llX ",as_ulong(r0)); -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (get_sub_group_local_id() == 0) \ - printf("\n%llX ",r##row); \ - else \ - printf("%llX ",r##row); - - HS_SLAB_ROWS(); - } -#endif - - // - // CAPTURE ALL CONDITIONS WE CARE ABOUT - // - // Diffs must be captured before cohorts - // - uint valid = 0; - uint blocks = 0; - uint pks = 0; - SKC_COHORT_TYPE c_max = 0; - - // - // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. 
AN - // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE - // -#if 0 - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - diffs |= ((c##row != c##prev) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - blocks |= (SKC_IS_BLOCK(row) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - pks |= SKC_IS_PK(row,prev) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - valid |= ((r##row != SKC_ULONG_MAX) << prev); - - HS_SLAB_ROWS(); - -#else - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (c##row != c##prev) \ - diffs |= 1<>HS_LANES_PER_WARP_LOG2,c_min,c_max); -#endif - - // - // ZERO SMEM - // - // zero only the meta info for the cohort ids found in this slab - // -#if (SKC_ZERO_WORDS >= SKC_META_WORDS) - uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id(); - uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO; - - for (; zz<=zz_max; zz+=HS_LANES_PER_WARP) - shared.z[zz] = 0; -#else - // ERROR -- it's highly unlikely that the zero type is smaller than - // the meta type -#error("Unsupported right now...") -#endif - - // - // ACCUMULATE AND STORE META INFO - // - uint const valid_blocks = valid & blocks; - uint const valid_pks = valid & pks & ~diffs; - SKC_META_TYPE meta = ( 0 ); - -#define SKC_META_LOCAL_ADD(meta) \ - atomic_add(shared.m+HS_REG_LAST(c),meta); - -#define SKC_META_LOCAL_STORE(meta,prev) \ - shared.m[c##prev] = meta; - - // note this is purposefully off by +1 -#define SKC_META_RESET(meta,curr) \ - meta = ((gmem_off + curr) << 8); - -#if 0 - - // FIXME -- this can be tweaked to shift directly -#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ - meta += ((((blocks >> prev) & 1) ) | \ - (((pks >> prev) & 1) << 16) | \ - (((rks >> prev) & 1) << 24)); - -#else - -#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ - if (blocks & (1<= cc_min) && (cc <= cc_max)) - { - uint const c = shared.c[cc]; - - if (c != 0) - atomic_add(metas+cc,c+adjust); - } - - cc += HS_LANES_PER_WARP; - - for (; cc<=cc_max; cc+=HS_LANES_PER_WARP) - { - uint const c = shared.c[cc]; - - if (c != 0) - atomic_add(metas+cc,c+adjust); - } -} - -// -// -// diff --git a/src/compute/skc/styling_cl_12.c b/src/compute/skc/styling_cl_12.c deleted file mode 100644 index 6c84fe6f70..0000000000 --- a/src/compute/skc/styling_cl_12.c +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// NOTES: -// -// - this particular object only needs a command queue for a short -// time so consider acquiring/releasing the command queue on demand -// but only if command queues are cached and expensive to keep -// - -#include "common/cl/assert_cl.h" - -#include "styling_cl_12.h" -#include "extent_cl_12.h" -#include "runtime_cl_12.h" - -#include "context.h" -#include "styling_types.h" - -// -// -// - -static -void -skc_styling_unmap_complete(skc_grid_t const grid) -{ - struct skc_styling_impl * const impl = skc_grid_get_data(grid); - - impl->state = SKC_STYLING_STATE_SEALED; - - skc_grid_complete(grid); -} - -static -void -skc_styling_unmap_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_styling_impl * const impl = skc_grid_get_data(grid); - struct skc_scheduler * const scheduler = impl->runtime->scheduler; - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(scheduler,skc_styling_unmap_complete,grid); -} - -static -void -skc_styling_grid_pfn_execute(skc_grid_t const grid) -{ - struct skc_styling_impl * const impl = skc_grid_get_data(grid); - struct skc_styling * const styling = impl->styling; - - // - // unmap all extents - // - cl_event complete; - - skc_extent_phwN_pdrN_unmap(&impl->layers,styling->layers.extent,impl->cq,NULL); - skc_extent_phwN_pdrN_unmap(&impl->groups,styling->groups.extent,impl->cq,NULL); - skc_extent_phwN_pdrN_unmap(&impl->extras,styling->extras.extent,impl->cq,&complete); - - // set the event - cl(SetEventCallback(complete,CL_COMPLETE,skc_styling_unmap_cb,grid)); - cl(ReleaseEvent(complete)); - - // flush command queue - cl(Flush(impl->cq)); -} - -// -// -// - -static -void -skc_styling_pfn_seal(struct skc_styling_impl * const impl) -{ - // return if sealing or sealed - if (impl->state >= SKC_STYLING_STATE_SEALING) - return; - - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // - // otherwise, wait for UNSEALING > UNSEALED transition - // - if (impl->state == SKC_STYLING_STATE_UNSEALING) - { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); - } - - // - // we're unsealed so we need to seal and start the grid - // - impl->state = SKC_STYLING_STATE_SEALING; - impl->grid = SKC_GRID_DEPS_ATTACH(runtime->deps, - NULL, - impl, - NULL, // no waiting - skc_styling_grid_pfn_execute, - NULL); // no dispose - - // no need to force -- styling has no dependencies - skc_grid_start(impl->grid); -} - -// -// -// - -void -skc_styling_unseal_complete(struct skc_styling_impl * const impl) -{ - struct skc_runtime * const runtime = impl->runtime; - - // we're now unsealed - impl->state = SKC_STYLING_STATE_UNSEALED; -} - -static -void -skc_styling_unseal_cb(cl_event event, cl_int status, struct skc_styling_impl * const impl) -{ - SKC_CL_CB(status); - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(impl->runtime->scheduler,skc_styling_unseal_complete,impl); -} - -static -void -skc_styling_pfn_unseal(struct skc_styling_impl * const impl, skc_bool const block) -{ - // return if already unsealed - if (impl->state == SKC_STYLING_STATE_UNSEALED) - return; - - // - // otherwise, we're going to need to pump the scheduler - // - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // - // wait for UNSEALING > UNSEALED transition - // - 
if (impl->state == SKC_STYLING_STATE_UNSEALING) - { - if (block) { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); - } - return; - } - - // - // otherwise, wait for SEALING > SEALED transition ... - // - if (impl->state == SKC_STYLING_STATE_SEALING) - { - // wait if sealing - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_SEALED); - } - - // wait for rendering locks to be released - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0); - - // ... and then unseal the styling object - impl->state = SKC_STYLING_STATE_UNSEALING; - - // defensively NULL the grid reference - impl->grid = NULL; // defensive - - // set styling pointers with mapped extents - cl_event complete; - - struct skc_styling * const styling = impl->styling; - - styling->layers.extent = skc_extent_phwN_pdrN_map(&impl->layers,impl->cq,NULL); - styling->groups.extent = skc_extent_phwN_pdrN_map(&impl->groups,impl->cq,NULL); - styling->extras.extent = skc_extent_phwN_pdrN_map(&impl->extras,impl->cq,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_styling_unseal_cb,impl)); - cl(ReleaseEvent(complete)); - - // flush it - cl(Flush(impl->cq)); - - // wait until unsealed... - if (block) { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); - } -} - -// -// -// - -static -void -skc_styling_pfn_release(struct skc_styling_impl * const impl) -{ - if (--impl->styling->ref_count != 0) - return; - - // - // otherwise, unmap all resources by sealing and delete - // - skc_styling_pfn_seal(impl); - - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // wait until sealed - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_SEALED); - - // wait for locks to drain - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0) - - // - // styling is now disposable - // - - // free styling host - skc_runtime_host_perm_free(runtime,impl->styling); - - // release the cq - skc_runtime_release_cq_in_order(runtime,impl->cq); - - // free extents - skc_extent_phwN_pdrN_free(runtime,&impl->layers); - skc_extent_phwN_pdrN_free(runtime,&impl->groups); - skc_extent_phwN_pdrN_free(runtime,&impl->extras); - - // free styling impl - skc_runtime_host_perm_free(runtime,impl); -} - -// -// -// - -void -skc_styling_retain_and_lock(struct skc_styling * const styling) -{ - skc_styling_retain(styling); - - styling->impl->lock_count += 1; -} - -void -skc_styling_unlock_and_release(struct skc_styling * const styling) -{ - styling->impl->lock_count -= 1; - - skc_styling_pfn_release(styling->impl); -} - -// -// -// - -skc_err -skc_styling_cl_12_create(struct skc_context * const context, - struct skc_styling * * const styling, - skc_uint const layers_count, - skc_uint const groups_count, - skc_uint const extras_count) -{ - // retain the context - // skc_context_retain(context); - - // allocate the impl - struct skc_runtime * const runtime = context->runtime; - struct skc_styling_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - // allocate styling - (*styling) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**styling)); - (*styling)->context = context; - (*styling)->impl = impl; - - // intialize impl - impl->styling = (*styling); - impl->runtime = runtime; - - SKC_ASSERT_STATE_INIT(impl,SKC_STYLING_STATE_SEALED); - - impl->lock_count = 0; - - impl->cq = skc_runtime_acquire_cq_in_order(runtime); - - // - // The styling object 
is unique in that the API lets the user - // specify resource limits - // - // The styling object is a simple container that can have wildly - // varying resource requirements (but still relatively modest). - // - // Additionally, an advanced SKC programmer may want to create many - // styling and composition objects as they're relatively cheap. - // - skc_extent_phwN_pdrN_alloc(runtime,&impl->layers,sizeof(*(*styling)->layers.extent) * layers_count); - skc_extent_phwN_pdrN_alloc(runtime,&impl->groups,sizeof(*(*styling)->groups.extent) * groups_count); - skc_extent_phwN_pdrN_alloc(runtime,&impl->extras,sizeof(*(*styling)->extras.extent) * extras_count); - - // initialize styling - (*styling)->layers.size = layers_count; - (*styling)->groups.size = groups_count; - (*styling)->extras.size = extras_count; - - (*styling)->layers.count = 0; - (*styling)->groups.count = 0; - (*styling)->extras.count = 0; - - // save pfns - (*styling)->seal = skc_styling_pfn_seal; - (*styling)->unseal = skc_styling_pfn_unseal; - (*styling)->release = skc_styling_pfn_release; - - // set ref count - (*styling)->ref_count = 1; - - // map the extents by unsealing - skc_styling_pfn_unseal(impl,false); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/styling_cl_12.h b/src/compute/skc/styling_cl_12.h deleted file mode 100644 index a319568ee5..0000000000 --- a/src/compute/skc/styling_cl_12.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include - -#include "styling.h" -#include "grid.h" -#include "extent_cl_12.h" -#include "assert_state.h" - -// -// styling states -// - -typedef enum skc_styling_state_e { - - SKC_STYLING_STATE_UNSEALING, - SKC_STYLING_STATE_UNSEALED, - SKC_STYLING_STATE_SEALING, - SKC_STYLING_STATE_SEALED - -} skc_styling_state_e; - -// -// IMPL -// - -struct skc_styling_impl -{ - struct skc_styling * styling; - struct skc_runtime * runtime; - - SKC_ASSERT_STATE_DECLARE(skc_styling_state_e); - - skc_int lock_count; // # of wip renders - - skc_grid_t grid; - - // in-order command queue - cl_command_queue cq; - - // - // only 3 extents - // - struct skc_extent_phwN_pdrN layers; - struct skc_extent_phwN_pdrN groups; - struct skc_extent_phwN_pdrN extras; -}; - -// -// ONLY VISIBLE WITHIN THIS RUNTIME -// - -void -skc_styling_retain_and_lock(struct skc_styling * const styling); - -void -skc_styling_unlock_and_release(struct skc_styling * const styling); - -// -// -// diff --git a/src/compute/skc/surface_cl_12.h b/src/compute/skc/surface_cl_12.h deleted file mode 100644 index 43ea5428a5..0000000000 --- a/src/compute/skc/surface_cl_12.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_SURFACE_CL_12_ONCE -#define SKC_SURFACE_CL_12_ONCE - -// -// Unlike other object platform implementations, the surface object -// implementation needs to access the opaque platform-specfic outputs -// of the composition and styling objects. -// -// Composition : { keys, offsets, key_count, offset_count } -// Styling : { layers, groups, commands } -// -// With the OpenCL platform we'll handle this by simply exposing the -// argument value (void*) and its size (size_t). 
-// -// TODO: It might make sense in the future to support more complex -// rendering jobs that simultaneously involve multiple surfaces, -// compositions and stylings. -// - -#endif - -// -// -// diff --git a/src/compute/skc/surface_cl_12_buffer.c b/src/compute/skc/surface_cl_12_buffer.c deleted file mode 100644 index cc7cba5225..0000000000 --- a/src/compute/skc/surface_cl_12_buffer.c +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "common/cl/assert_cl.h" - -#include "extent_cl_12.h" -#include "runtime_cl_12.h" -#include "styling_cl_12.h" -#include "composition_cl_12.h" - -#include "context.h" -#include "surface.h" - -// -// -// - -#include - -// -// BUILD -// - -struct skc_surface_impl -{ - struct skc_surface * surface; - struct skc_runtime * runtime; - - // framebuffer - // struct skc_extent_pdrw fb; - // struct skc_extent_phrN_pdwN fb; - - // for now, a single in-order command queue - cl_command_queue cq; - - struct { - cl_kernel render; - } kernels; -}; - -// -// we might want concurrent access to the same surface as long as -// the clips don't overlap. -// -// this would require acquiring a cq on demand when it is determined -// that the clipped render won't overlap -// -// { tile clip , cq } pair -// -// skc_uint4 clip; -// cl_command_queue cq -// - -struct skc_surface_render -{ - skc_uint clip[4]; - - struct skc_surface_impl * impl; - struct skc_styling * styling; - struct skc_composition * composition; - - skc_surface_render_pfn_notify notify; - void * data; - - cl_mem fb; - - skc_grid_t grid; - - skc_subbuf_id_t id; -}; - -// -// -// - -static -void -skc_surface_pfn_clear(struct skc_surface_impl * const impl, - float const rgba[4], - skc_uint const rect[4], - void * fb) -{ - size_t const origin[3] = { rect[0], rect[1], 0 }; - size_t const region[3] = { rect[2], rect[3], 1 }; - - cl(EnqueueFillImage(impl->cq, - (cl_mem)fb, - rgba, - origin, - region, - 0,NULL,NULL)); -} - -// -// -// - -static -void -skc_surface_pfn_blit(struct skc_surface_impl * const impl, - skc_uint const rect[4], - skc_int const txty[2]) -{ - ; -} - -// -// -// - -#if 0 // #ifndef NDEBUG -#define SKC_SURFACE_DEBUG -#endif - -#ifdef SKC_SURFACE_DEBUG - -#define SKC_SURFACE_WIDTH 4096 -#define SKC_SURFACE_HEIGHT 4096 - -static -void -skc_surface_debug(struct skc_surface_impl * const impl) -{ - // - // MAP - // - cl_uchar4 * const rgba = skc_extent_phrN_pdwN_map(&impl->fb, - impl->cq, - NULL); - cl(Finish(impl->cq)); - - // - // WRITE - // - FILE* file; - - errno_t ferr = fopen_s(&file,"surface.ppm","wb"); - - fprintf(file,"P6\n%u %u\n255\n",SKC_SURFACE_WIDTH,SKC_SURFACE_HEIGHT); - - for (skc_uint ii=0; iifb,rgba,impl->cq,NULL); - - cl(Flush(impl->cq)); -} - -#endif - -// -// -// - -void -skc_surface_render_complete(struct skc_surface_render * const render) -{ -#ifdef SKC_SURFACE_DEBUG - // write fb out - skc_surface_debug(render->impl); -#endif - - // notify - if (render->notify != NULL) { - render->notify(render->impl->surface, - render->styling, - render->composition, - render->data); - } - - // unlock and release the styling and composition - skc_styling_unlock_and_release(render->styling); - skc_composition_unlock_and_release(render->composition); - - // grid is now complete - skc_grid_complete(render->grid); -} - -static -void -skc_surface_render_cb(cl_event event, cl_int status, struct skc_surface_render * const render) -{ - SKC_CL_CB(status); - 
- // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(render->impl->runtime->scheduler, - skc_surface_render_complete, - render); -} - -// -// -// - -static -void -skc_surface_grid_pfn_execute(skc_grid_t const grid) -{ - struct skc_surface_render * const render = skc_grid_get_data(grid); - struct skc_surface_impl * const impl = render->impl; - struct skc_runtime * const runtime = impl->runtime; - - // get the composition args - struct skc_composition_impl * const ci = render->composition->impl; - struct skc_place_atomics * const atomics = ci->atomics.hr; - - if (atomics->offsets > 0) - { - // acquire the rbo - cl(EnqueueAcquireGLObjects(impl->cq,1,&render->fb,0,NULL,NULL)); - - // get the styling args - struct skc_styling_impl * const si = render->styling->impl; - - cl(SetKernelArg(impl->kernels.render,0,SKC_CL_ARG(si->layers.drN))); - cl(SetKernelArg(impl->kernels.render,1,SKC_CL_ARG(si->groups.drN))); - cl(SetKernelArg(impl->kernels.render,2,SKC_CL_ARG(si->extras.drN))); - - cl(SetKernelArg(impl->kernels.render,3,SKC_CL_ARG(ci->keys.drw))); - cl(SetKernelArg(impl->kernels.render,4,SKC_CL_ARG(atomics->keys))); - cl(SetKernelArg(impl->kernels.render,5,SKC_CL_ARG(ci->offsets.drw))); - cl(SetKernelArg(impl->kernels.render,6,SKC_CL_ARG(atomics->offsets))); - - // block pool - cl(SetKernelArg(impl->kernels.render,7,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); - - // surface - cl(SetKernelArg(impl->kernels.render,8,SKC_CL_ARG(render->fb))); - -#if 1 - // tile clip - cl(SetKernelArg(impl->kernels.render,9,sizeof(skc_uint4),render->clip)); -#else - // surface pitch (height) - skc_uint const surface_pitch = SKC_SURFACE_HEIGHT; - cl(SetKernelArg(impl->kernels.render,9,SKC_CL_ARG(surface_pitch))); - // tile clip - cl(SetKernelArg(impl->kernels.render,10,sizeof(skc_uint4),render->clip)); -#endif - - // launch render kernel - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_RENDER, - impl->cq, - impl->kernels.render, - atomics->offsets, - 0,NULL,NULL); - - - cl_event complete; - - // give the rbo back - cl(EnqueueReleaseGLObjects(impl->cq,1,&render->fb,0,NULL,&complete)); - - // notify anyone listening... 
- cl(SetEventCallback(complete,CL_COMPLETE,skc_surface_render_cb,render)); - cl(ReleaseEvent(complete)); - - // flush it - cl(Flush(impl->cq)); - } - else - { - skc_surface_render_complete(render); - } -} - -// -// -// - -static -void -skc_surface_pfn_release(struct skc_surface_impl * const impl) -{ - if (--impl->surface->ref_count != 0) - return; - - // - // otherwise, release all resources - // - - // drain the command queue - cl(Finish(impl->cq)); - - struct skc_runtime * const runtime = impl->runtime; - - // release the kernel - cl(ReleaseKernel(impl->kernels.render)); - - // free surface host - skc_runtime_host_perm_free(runtime,impl->surface); - - // release the cq - skc_runtime_release_cq_in_order(runtime,impl->cq); - - // release fb - // skc_extent_phrN_pdwN_free(runtime,&impl->fb); - - // free surface impl - skc_runtime_host_perm_free(runtime,impl); -} - -// -// -// - -static -void -skc_surface_grid_pfn_dispose(skc_grid_t const grid) -{ - struct skc_surface_render * const render = skc_grid_get_data(grid); - struct skc_surface_impl * const impl = render->impl; - struct skc_runtime * const runtime = impl->runtime; - - // free the render object - skc_runtime_host_temp_free(runtime,render,render->id); - - // release the surface - skc_surface_pfn_release(impl); -} - -// -// -// - -static -void -skc_surface_pfn_render(struct skc_surface_impl * const impl, - uint32_t const clip[4], - skc_styling_t styling, - skc_composition_t composition, - skc_surface_render_pfn_notify notify, - void * data, - void * fb) -{ - // retain surface - skc_surface_retain(impl->surface); - - // - // FIXME -- we used to seal the styling and composition objects if - // they weren't already. Either test that they're sealed or seal - // them here. - // - - // retain and lock the styling and composition - skc_styling_retain_and_lock(styling); - skc_composition_retain_and_lock(composition); - - // - // allocate a render instance - // - skc_subbuf_id_t id; - struct skc_surface_render * const render = skc_runtime_host_temp_alloc(impl->runtime, - SKC_MEM_FLAGS_READ_WRITE, - sizeof(*render),&id,NULL); - render->id = id; - - render->clip[0] = clip[0]; - render->clip[1] = clip[1]; - render->clip[2] = clip[2]; - render->clip[3] = clip[3]; - - render->impl = impl; - render->styling = styling; - render->composition = composition; - - render->notify = notify; - render->data = data; - - render->fb = fb; - - render->grid = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - NULL, // invalidation not necessary - render, - NULL, // no waiting - skc_surface_grid_pfn_execute, - skc_surface_grid_pfn_dispose); - - // declare happens-after relationships - skc_grid_happens_after_grid(render->grid,styling->impl->grid); - skc_grid_happens_after_grid(render->grid,composition->impl->grids.sort); - - // wait for styling and composition - skc_grid_start(render->grid); -} - -// -// -// - -skc_err -skc_surface_cl_12_create(struct skc_context * const context, - struct skc_surface * * const surface) -{ - struct skc_runtime * const runtime = context->runtime; - - // allocate surface - (*surface) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**surface)); - - // allocate impl - struct skc_surface_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - // initialize surface - // SKC_ASSERT_STATE_INIT((*impl),SKC_SURFACE_STATE_READY); - - (*surface)->context = context; - (*surface)->impl = impl; - (*surface)->ref_count = 1; - - (*surface)->release = skc_surface_pfn_release; - 
(*surface)->clear = skc_surface_pfn_clear; - (*surface)->blit = skc_surface_pfn_blit; - (*surface)->render = skc_surface_pfn_render; - - // intialize impl - impl->surface = *surface; - impl->runtime = runtime; - -#if 0 - // FIXME -- 4K x 4K -- temporarily fixed size - size_t const fb_size = sizeof(skc_uchar4) * SKC_SURFACE_WIDTH * SKC_SURFACE_HEIGHT; - - // create framebuffer - skc_extent_phrN_pdwN_alloc(runtime,&impl->fb,fb_size); -#endif - - // acquire a command queue - impl->cq = skc_runtime_acquire_cq_in_order(runtime); - - // acquire kernel - impl->kernels.render = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RENDER); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/types.h b/src/compute/skc/types.h index 6d6d19aba2..655cea0ad4 100644 --- a/src/compute/skc/types.h +++ b/src/compute/skc/types.h @@ -38,12 +38,6 @@ // // -#include - -// -// -// - #define SKC_TYPE_HELPER(t) skc_##t #define SKC_TYPE(t) SKC_TYPE_HELPER(t) @@ -114,16 +108,16 @@ typedef cl_float16 skc_float16; typedef cl_half skc_half; -#if defined( __CL_HALF2__) +#if defined(__CL_HALF2__) typedef cl_half2 skc_half2; #endif -#if defined( __CL_HALF4__) +#if defined(__CL_HALF4__) typedef cl_half4 skc_half4; #endif -#if defined( __CL_HALF8__) +#if defined(__CL_HALF8__) typedef cl_half8 skc_half8; #endif -#if defined( __CL_HALF16__) +#if defined(__CL_HALF16__) typedef cl_half16 skc_half16; #endif @@ -206,16 +200,16 @@ typedef float16 skc_float16; typedef half skc_half; -#if defined( __CL_HALF2__) +#if defined(__CL_HALF2__) typedef half2 skc_half2; #endif -#if defined( __CL_HALF4__) +#if defined(__CL_HALF4__) typedef half4 skc_half4; #endif -#if defined( __CL_HALF8__) +#if defined(__CL_HALF8__) typedef half8 skc_half8; #endif -#if defined( __CL_HALF16__) +#if defined(__CL_HALF16__) typedef half16 skc_half16; #endif @@ -243,12 +237,6 @@ typedef half16 skc_half16; // // -#endif - -// -// -// - #define SKC_UCHAR_MAX 0xFF #define SKC_SHORT_MAX 0x7FFF @@ -265,3 +253,9 @@ typedef half16 skc_half16; // // +#endif + +// +// +// + -- cgit v1.2.3