diff options
author | Allan MacKinnon <allanmac@google.com> | 2018-06-20 08:29:07 -0700 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2018-06-20 16:03:28 +0000 |
commit | 879c98913c360b01f63588685c01ac06e83be54d (patch) | |
tree | bf91e5e6ded3d97936936678b72c6499502f0462 | |
parent | a27f2694f0af467f496e6697bc0c4edd4966a3e0 (diff) |
Overdue reorg of source tree to support multiple platforms & devices.
Bug: skia:
Change-Id: I1248a529a932ed5ef32952a1bb7eca56ee1c5f25
Reviewed-on: https://skia-review.googlesource.com/136170
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
-rw-r--r-- | src/compute/skc/Makefile | 79 | ||||
-rw-r--r-- | src/compute/skc/cl_20/extent.c | 787 | ||||
-rw-r--r-- | src/compute/skc/cl_20/extent.h | 390 | ||||
-rw-r--r-- | src/compute/skc/cl_20/ring_cl_svm_fine.cpp | 89 | ||||
-rw-r--r-- | src/compute/skc/cl_20/ring_cl_svm_fine.h | 46 | ||||
-rw-r--r-- | src/compute/skc/common.h | 2 | ||||
-rw-r--r-- | src/compute/skc/main.c | 20 | ||||
-rw-r--r-- | src/compute/skc/make_all.bat | 15 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/allocator_device_cl.c (renamed from src/compute/skc/allocator_device_cl.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/allocator_device_cl.h (renamed from src/compute/skc/allocator_device_cl.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/atomic_cl.h (renamed from src/compute/skc/atomic_cl.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/block_pool_cl.h (renamed from src/compute/skc/block_pool_cl.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/block_pool_cl_12.h (renamed from src/compute/skc/block_pool_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/composition_cl_12.c (renamed from src/compute/skc/composition_cl_12.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/composition_cl_12.h (renamed from src/compute/skc/composition_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/config_cl.h (renamed from src/compute/skc/config_cl.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/cq_pool_cl.c (renamed from src/compute/skc/cq_pool_cl.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/cq_pool_cl.h (renamed from src/compute/skc/cq_pool_cl.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/device_cl_12.h (renamed from src/compute/skc/device_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/export_cl_12.h (renamed from src/compute/skc/export_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/extent_cl_12.c (renamed from src/compute/skc/extent_cl_12.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/extent_cl_12.h (renamed from src/compute/skc/extent_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/extent_cl_12_unified.c (renamed from src/compute/skc/extent_cl_12_unified.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/gl/interop.c (renamed from src/compute/skc/interop.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/gl/interop.h (renamed from src/compute/skc/interop.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/handle_pool_cl_12.c (renamed from src/compute/skc/handle_pool_cl_12.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/handle_pool_cl_12.h (renamed from src/compute/skc/handle_pool_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl (renamed from src/compute/skc/block_pool_init.cl) | 128 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h (renamed from src/compute/skc/device_cl_12_avx2.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c (renamed from src/compute/skc/device_cl_12_gen9.c) | 84 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h (renamed from src/compute/skc/device_cl_12_gen9.h) | 10 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat | 15 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat (renamed from src/compute/skc/make_inl_cl.bat) | 21 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/fills_expand.cl (renamed from src/compute/skc/fills_expand.cl) | 618 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/paths_copy.cl (renamed from src/compute/skc/paths_copy.cl) | 1086 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl (renamed from src/compute/skc/paths_reclaim.cl) | 780 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/place.cl (renamed from src/compute/skc/place.cl) | 1742 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/prefix.cl (renamed from src/compute/skc/prefix.cl) | 2083 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/rasterize.cl (renamed from src/compute/skc/rasterize.cl) | 6733 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl (renamed from src/compute/skc/rasters_alloc.cl) | 288 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl (renamed from src/compute/skc/rasters_reclaim.cl) | 884 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/render.cl (renamed from src/compute/skc/render.cl) | 4330 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl (renamed from src/compute/skc/segment_ttck.cl) | 261 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl (renamed from src/compute/skc/segment_ttrk.cl) | 790 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/path_builder_cl_12.c (renamed from src/compute/skc/path_builder_cl_12.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/path_builder_cl_12.h (renamed from src/compute/skc/path_builder_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/raster_builder_cl_12.c (renamed from src/compute/skc/raster_builder_cl_12.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/raster_builder_cl_12.h (renamed from src/compute/skc/raster_builder_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/runtime_cl.c (renamed from src/compute/skc/runtime_cl.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/runtime_cl.h (renamed from src/compute/skc/runtime_cl.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/runtime_cl_12.c (renamed from src/compute/skc/runtime_cl_12.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/runtime_cl_12.h (renamed from src/compute/skc/runtime_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/styling_cl_12.c (renamed from src/compute/skc/styling_cl_12.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/styling_cl_12.h (renamed from src/compute/skc/styling_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/surface_cl_12.h (renamed from src/compute/skc/surface_cl_12.h) | 0 | ||||
-rw-r--r-- | src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c (renamed from src/compute/skc/surface_cl_12_buffer.c) | 0 | ||||
-rw-r--r-- | src/compute/skc/types.h | 34 |
57 files changed, 9965 insertions, 11350 deletions
diff --git a/src/compute/skc/Makefile b/src/compute/skc/Makefile deleted file mode 100644 index e6516e3fd1..0000000000 --- a/src/compute/skc/Makefile +++ /dev/null @@ -1,79 +0,0 @@ -#
-# Copyright 2016 Google Inc.
-#
-# Use of this source code is governed by a BSD-style license that can
-# be found in the LICENSE file.
-#
-
-SRC = block_pool_init.cl paths_copy.cl fills_expand.cl rasterize.cl raster_alloc.cl prefix.cl place.cl render.cl
-
-PRE = $(SRC:%.cl=%.pre.cl)
-
-IR_GEN9 = $(PRE:%.cl=%.ir)
-
-$(info PRE : $(PRE))
-$(info IR_GEN9 : $(IR_GEN9))
-
-#
-#
-#
-
-OPENCL_STD = -cl-std=CL1.2
-OPENCL_PRE = __OPENCL_C_VERSION__=120
-
-# OPENCL_STD = -cl-std=CL2.0
-# OPENCL_PRE = __OPENCL_C_VERSION__=200
-
-#
-#
-#
-
-TARGETS = $(PRE) $(IR_GEN9)
-
-#
-#
-#
-
-IOC = ioc64
-
-IOC_IR_OPTS_OPT = $(OPENCL_STD) -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable \
- -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info
-
-IOC_IR_OPTS_DBG = $(OPENCL_STD) -cl-kernel-arg-info -g
-
-IOC_IR_OPTS = $(IOC_IR_OPTS_OPT)
-
-#
-#
-#
-
-PRE_DEPS = $(wildcard *.h)
-
-#
-#
-#
-
-all: $(TARGETS)
-
-
-clean:
- -rm -f $(TARGETS) $(wildcard *.pre.bin.inl) $(wildcard *.pre.src.inl) $(wildcard *.gen) $(wildcard *.TMP)
-
-#
-# PREPROCESS
-#
-
-$(PRE): %.pre.cl: %.cl $(PRE_DEPS)
- cl -I . -I "%INTELOCLSDKROOT%\include" -D $(OPENCL_PRE) -EP $< -P -Fi"$@"
- clang-format -i $@
- dos2unix $@
- xxd -i $@ $(basename $@).src.inl
-
-#
-# GEN9 -- supports OpenCL 2.0 and can emit SPIR-V / SPIR-V TEXT but cannot load it via clCreateProgramWithIL()
-#
-
-$(IR_GEN9): %.ir: %.cl
- touch $@
- $(IOC) -cmd=build -bo="$(IOC_IR_OPTS)" -device=gpu -input=$< -ir=$@ -asm
- xxd -i $@ $(basename $@).bin.inl
diff --git a/src/compute/skc/cl_20/extent.c b/src/compute/skc/cl_20/extent.c deleted file mode 100644 index 4c073e8b69..0000000000 --- a/src/compute/skc/cl_20/extent.c +++ /dev/null @@ -1,787 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#include <string.h> - -// #include "extent.h" - -// -// EXTENT TYPES -// -// Classification of operations on allocated GPU memory -// -// h = host -// d = device -// -// c = append using non-atomic incremented count -// x = append using atomically incremented index -// p = allocated from pool of indices -// g = gathered by pull kernel -// s = size is available -// -// w1 = write once -// wN = write many -// -// r1 = read once -// rN = read many -// -// rw = read/write many -// -// host<>device memory model -// +--------------------+--------------------+ -// extent type | split | shared | examples -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_atomic | device+mapped | device+mapped | atomically op'd device extent + read-only host snapshot -// | | | -// extent_dxrw | device | device | ttsk_array, ttpk_array, ttck_array, *_offsets -// extent_hcw1_dr1 | mapped | mapped | command_queue, buffer -// extent_hcrw | host | host | queue -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_hcw1_drN | memcpy'd | mapped | stack_transforms, stack_stroke_props -// extent_hgw1_drN | scatter/gather | mapped | layer_props -// | | | -// block_pool_dprw | device | device | ttsb_pool, ttpb_pool -// block_pool_hp_drw | device | device | raster_pool -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// block_pool_hp_drw | block_pool_hp_drw | block_pool_hp_drw | path_block_pool -// staging buffer | extent_hw_dr | -- | -// | | | -// - -// -// HIGH-LEVEL 
EXTENTS ARE BUILT FROM SIMPLER STRUCTURES -// - -// -// COUNTERS FOR POOLS -- TYPICALLY ATOMIC WHEN ON DEVICE -// - -union skc_ring -{ - skc_uint2 u32v2; - - skc_uint u32a2[2]; - - struct { - skc_uint reads; // number of reads - skc_uint writes; // number of writes - }; -}; - -// -// POOL OF INDICES TO BLOCKS -// - -struct skc_pool_h -{ - skc_uint * indices; -}; - -struct skc_pool_d -{ - cl_mem * indices; // FIXME -- READ POOL INDICES THROUGH CONSTANT CACHE? -}; - -// -// LOW-LEVEL EXTENTS -- SIZES ARE STORED ELSEWHERE -// - -struct skc_extent_hrw -{ - void * hrw; // host pointer to host extent -- read/write -}; - -struct skc_extent_drw -{ - cl_mem drw; // device pointer to device extent -- read/write -}; - -struct skc_extent_hw_dr -{ - void * hw; // host pointer to shared extent -- write-only + write-combined - cl_mem dr; // device pointer to shared extent -- read-only -}; - -// -// -// - -#if 0 -static -void * -skc_runtime_svm_alloc(struct skc_runtime_cl * const runtime_cl, size_t const size) -{ - return clSVMAlloc(runtime_cl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, - size, - 0); -} - -static -void * -skc_runtime_svm_atomic_alloc(struct skc_runtime_cl * const runtime_cl, size_t const size) // WE DON'T NEED THIS HERE -{ - return clSVMAlloc(runtime_cl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, - size, - 0); -} - -static -void -skc_runtime_svm_free(struct skc_runtime_cl * const runtime_cl, void * const buffer) -{ - clSVMFree(runtime_cl->context,buffer); -} -#endif - -// -// -// - -void -skc_command_queue_fill_device(struct skc_command_queue * const cq, - cl_mem buffer, - void const * const pattern, - size_t const pattern_size, - size_t const size); - -void * -skc_command_queue_map_wi(struct skc_command_queue * const cq, - cl_mem buffer); - -void -skc_command_queue_unmap(struct skc_command_queue * const cq, - cl_mem buffer, - void * const mapped); - -void -skc_command_queue_read(struct skc_command_queue * 
const cq, - cl_mem buffer, - void * const ptr); - -// -// -// - -struct skc_extent_hrw * -skc_extent_hrw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hrw = skc_allocator_alloc_host(allocator,size); - - return extent; -} - - - -void -skc_extent_hrw_free(struct skc_allocator * const allocator, - struct skc_extent_hrw * const extent) -{ - skc_allocator_free_host(allocator,extent->hrw); - skc_allocator_free_host(allocator,extent); -} - -// -// -// - -struct skc_extent_drw * -skc_extent_drw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_drw * extent; - - extent = skc_allocator_alloc_host (allocator,sizeof(*extent)); - extent->drw = skc_allocator_alloc_device(allocator,size); - - return extent; -} - -void -skc_extent_drw_free(struct skc_allocator * const allocator, - struct skc_extent_drw * const extent) -{ - skc_allocator_free_device(allocator,extent->drw); - skc_allocator_free_host (allocator,extent); -} - -void -skc_extent_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size) -{ - skc_command_queue_fill_device(cq,extent->drw,pattern,pattern_size,size); -} - -// -// WRITE-COMBINED / WRITE-INVALIDATE -// - -struct skc_extent_hw_dr * -skc_extent_hw_dr_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hw_dr * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw = NULL; - extent->dr = skc_allocator_alloc_device_wc(allocator,size); // write-combined mem - - return extent; -} - -void -skc_extent_hw_dr_free(struct skc_allocator * const allocator, - struct skc_extent_hw_dr * const extent) -{ - skc_allocator_free_device(allocator,extent->dr); - skc_allocator_free_host (allocator,extent); -} - -void 
-skc_extent_hw_dr_map(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent) -{ - extent->hw = skc_command_queue_map_wi(cq,extent->dr); -} - -void -skc_extent_hw_dr_unmap(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent) -{ - skc_command_queue_unmap(cq,extent->dr,extent->hw); -} - -void -skc_extent_hw_dr_memcpy(struct skc_extent_hw_dr * const extent, - void const * SKC_RESTRICT const src, - size_t const offset, - size_t const size) -{ - void * SKC_RESTRICT const dst = (char *)extent->hw + offset; - - memcpy(dst,src,size); -} -// -// SNAPSHOT -// - -struct skc_extent_hr_drw -{ - void * hr; // host pointer to shared extent -- readable snapshot - cl_mem drw; // device pointer to shared extent -- read/write -}; - -struct skc_extent_hr_drw * -skc_extent_hr_drw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hr_drw * extent; - - extent = skc_allocator_alloc_host (allocator,sizeof(*extent)); - extent->hr = skc_allocator_alloc_host (allocator,size); - extent->drw = skc_allocator_alloc_device(allocator,size); - - return extent; -} - -void -skc_extent_hr_drw_free(struct skc_allocator * const allocator, - struct skc_extent_hr_drw * const extent) -{ - skc_allocator_free_host (allocator,extent->hr); - skc_allocator_free_device(allocator,extent->drw); - skc_allocator_free_host (allocator,extent); -} - -void -skc_extent_hr_drw_snap(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - size_t const size) -{ - skc_command_queue_read(cq,extent->drw,extent->hr); -} - -void -skc_extent_hr_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size) -{ - skc_command_queue_fill_device(cq,extent->drw,pattern,pattern_size,size); -} - -// -// -// - -struct skc_extent_atomic -{ - struct skc_extent_hr_drw * hr_drw; - size_t size; // typically a very small extent 
-}; - -// -// -// - -struct skc_extent_atomic * -skc_extent_atomic_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_atomic * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hr_drw = skc_extent_hr_drw_alloc(allocator,size); - extent->size = size; - - return extent; -} - -void -skc_extent_atomic_free(struct skc_allocator * const allocator, - struct skc_extent_atomic * const extent) -{ - skc_extent_hr_drw_free (allocator,extent->hr_drw); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_atomic_snap(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent) -{ - skc_extent_hr_drw_snap(cq,extent->hr_drw,extent->size); -} - -void -skc_extent_atomic_zero(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent) -{ - skc_uint const zero = 0; - - skc_extent_hr_drw_fill(cq,extent->hr_drw,&zero,sizeof(zero),extent->size); -} - -// -// -// - -struct skc_extent_dxrw -{ - struct skc_extent_drw * drw; - - size_t elem_size; - skc_uint elem_count; - -#if 0 // SKC_EXTENT_ATOMIC_IS_IGNORED - struct skc_extent_atomic * atomic; - size_t atomic_offset; -#endif -}; - -// -// -// - -struct skc_extent_dxrw * -skc_extent_dxrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count, - struct skc_extent_atomic * const atomic, - size_t const atomic_offset) -{ - struct skc_extent_dxrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->drw = skc_extent_drw_alloc(allocator,elem_size * elem_count); - - extent->elem_size = elem_size; - extent->elem_count = elem_count; - - // - // note that passing in the atomic and its member has no real use at - // this point since the current programming style requires passing - // in the atomic extent -- which may have multiple members -- to the - // compute kernel - // -#if 0 // SKC_EXTENT_ATOMIC_IS_IGNORED - extent->atomic = atomic; - 
extent->atomic_offset = atomic_offset; -#endif - - return extent; -} - -void -skc_extent_dxrw_free(struct skc_allocator * const allocator, - struct skc_extent_dxrw * const extent) -{ - skc_extent_drw_free (allocator,extent->drw); - skc_allocator_free_host(allocator,extent); -} - -// -// -// - -struct skc_extent_hcrw -{ - struct skc_extent_hrw * hrw; - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcrw * -skc_extent_hcrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hrw = skc_extent_hrw_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcrw_free(struct skc_allocator * const allocator, - struct skc_extent_hcrw * const extent) -{ - skc_extent_hrw_free (allocator,extent->hrw); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcrw_reset(struct skc_extent_hcrw * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcrw_is_full(struct skc_extent_hcrw const * const extent) -{ - return (extent->counter == extent->elem_count); -} - -// -// -// - -struct skc_extent_hcw1_dr1 -{ - struct skc_extent_hw_dr * hw_dr; // mapped memory - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcw1_dr1 * -skc_extent_hcw1_dr1_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcw1_dr1 * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw_dr = skc_extent_hw_dr_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcw1_dr1_free(struct 
skc_allocator * const allocator, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_free (allocator,extent->hw_dr); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcw1_dr1_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_map(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_dr1_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_unmap(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_dr1_reset(struct skc_extent_hcw1_dr1 * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcw1_dr1_is_full(struct skc_extent_hcw1_dr1 const * const extent) -{ - return (extent->counter == extent->elem_count); -} - -skc_uint -skc_extent_hcw1_dr1_rem(struct skc_extent_hcw1_dr1 * const extent) -{ - return extent->elem_count - extent->counter; -} - -void -skc_extent_hcw1_dr1_append(struct skc_extent_hcw1_dr1 * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped) -{ - skc_extent_hw_dr_memcpy(extent->hw_dr, - elem_ptr, - extent->elem_size * extent->counter, - extent->elem_size * elem_count_clamped); -} - -// -// -// - -struct skc_extent_hcw1_drN_unified -{ - struct skc_extent_hw_dr * hw_dr; // mapped memory - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcw1_drN_unified * -skc_extent_hcw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcw1_drN_unified * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw_dr = skc_extent_hw_dr_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_drN_unified * const extent) -{ 
- skc_extent_hw_dr_free (allocator,extent->hw_dr); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcw1_drN_unified_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent) -{ - skc_extent_hw_dr_map(cq,extent->hw_dr); -} - - -void -skc_extent_hcw1_drN_unified_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent) -{ - skc_extent_hw_dr_unmap(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_drN_unified_reset(struct skc_extent_hcw1_drN_unified * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcw1_drN_unified_is_full(struct skc_extent_hcw1_drN_unified const * const extent) -{ - return (extent->counter == extent->elem_count); -} - - -skc_uint -skc_extent_hcw1_drN_unified_rem(struct skc_extent_hcw1_drN_unified * const extent) -{ - return extent->elem_count - extent->counter; -} - - -void -skc_extent_hcw1_drN_unified_append(struct skc_extent_hcw1_drN_unified * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped) -{ - skc_extent_hw_dr_memcpy(extent->hw_dr, - elem_ptr, - extent->elem_size * extent->counter, - extent->elem_size * elem_count_clamped); -} - -// -// -// - -struct skc_id_pool_hp * -skc_id_pool_hp_alloc(struct skc_allocator * const allocator, - skc_uint const count) -{ - return NULL; -} - -void -skc_id_pool_hp_free(struct skc_allocator * const allocator, - struct skc_id_pool_hp * const extent) -{ - ; -} - -void -skc_id_pool_hp_acquire(struct skc_id_pool_hp * const extent, - skc_uint * const id) -{ - ; -} - -void -skc_id_pool_hp_release_1(struct skc_id_pool_hp * const extent, - skc_uint const id) -{ - ; -} - -void -skc_id_pool_hp_release_n(struct skc_id_pool_hp * const extent, - skc_uint const * const id, - skc_uint const count) -{ - ; -} - -// -// -// - -struct skc_block_pool_dprw * -skc_block_pool_dprw_alloc(struct skc_allocator * const allocator, - union skc_ring * const ring_d, - skc_uint const 
block_size, - skc_uint const block_count) -{ - return NULL; -} - -void -skc_block_pool_dprw_free(struct skc_allocator * const allocator, - struct skc_block_pool_dprw * const extent) -{ - ; -} - -// -// -// - -struct skc_extent_hgw1_drN * -skc_extent_hgw1_drN_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - return NULL; -} - -void -skc_extent_hgw1_drN_free(struct skc_allocator * const allocator, - struct skc_extent_hgw1_drN * const extent) -{ - ; -} - -void -skc_extent_hgw1_drN_reset(struct skc_extent_hgw1_drN * const extent) -{ - ; -} - -void -skc_extent_hgw1_drN_snap(struct skc_command_queue * const cq, - struct skc_extent_hgw1_drN const * const extent) -{ - ; -} - -// -// -// - -#if 0 - -// -// -// - -struct skc_block_pool_hp_drw * -skc_block_pool_hp_drw_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - return NULL; -} - -void -skc_block_pool_hp_drw_free(struct skc_allocator * const allocator, - struct skc_block_pool_hp_drw * const extent) -{ - ; -} - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/cl_20/extent.h b/src/compute/skc/cl_20/extent.h deleted file mode 100644 index 2993968a50..0000000000 --- a/src/compute/skc/cl_20/extent.h +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include "skc.h" -#include "allocator.h" - -// -// EXTENT TYPES -// -// Classification of operations on allocated GPU memory -// -// h = host -// d = device -// -// c = append using non-atomic incremented count -// x = append using atomically incremented index -// p = allocated from pool of indices -// g = gathered by pull kernel -// -// w1 = write once -// wN = write many -// -// r1 = read once -// rN = read many -// -// rw = read/write many -// -// host<>device memory model -// +--------------------+--------------------+ -// extent type | split | shared | examples -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_atomic | device+mapped | device+mapped | atomically op'd device extent + read-only host snapshot -// | | | -// extent_dxrw | device | device | ttsk_array, ttpk_array, ttck_array, *_offsets -// extent_hcw1_dr1 | mapped | mapped | command_queue, buffer -// extent_hcrw | host | host | queue -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_hcw1_drN | memcpy'd | mapped | stack_transforms, stack_stroke_props -// extent_hgw1_drN | scatter/gather | mapped | layer_props -// | | | -// block_pool_dprw | device | device | ttsb_pool, ttpb_pool -// block_pool_hp_drw | device | device | raster_pool -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// block_pool_hp_drw | block_pool_hp_drw | block_pool_hp_drw | path_block_pool -// staging buffer | extent_hw_dr | -- | -// | | | -// - -struct skc_extent_hrw; -struct skc_extent_drw; - -struct skc_extent_hrw_drN; -struct skc_extent_hw1_drN; -struct skc_extent_hrN_drw; - -struct skc_extent_atomic; - -struct skc_extent_hcrw; -struct skc_extent_dxrw; - -struct skc_block_pool_dprw; - -struct skc_id_pool_hp; - -struct skc_extent_hcw1_dr1; -struct skc_extent_hcw1_drN; -struct skc_extent_hgw1_drN; - -// 
-// -// - -void * -skc_extent_hrw_drN_get_hrw(struct skc_extent_hrw_drN * extent); - -void * -skc_extent_hw1_drN_get_hw1(struct skc_extent_hw1_drN * extent); - -// -// -// - -struct skc_extent_hrw * -skc_extent_hrw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hrw_free(struct skc_allocator * const allocator, - struct skc_extent_hrw * const extent); - -void * -skc_extent_hrw_get_hrw(struct skc_extent_hrw * extent); - -// -// -// - -struct skc_extent_drw * -skc_extent_drw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_drw_free(struct skc_allocator * const allocator, - struct skc_extent_drw * const extent); - -void -skc_extent_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size); - -// -// -// - -struct skc_extent_hw_dr * -skc_extent_hw_dr_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hw_dr_free(struct skc_allocator * const allocator, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_map(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_unmap(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_memcpy(struct skc_extent_hw_dr * const extent, - void const * SKC_RESTRICT const src, - size_t const offset, - size_t const size); -// -// -// - -struct skc_extent_hr_drw * -skc_extent_hr_drw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hr_drw_free(struct skc_allocator * const allocator, - struct skc_extent_hr_drw * const extent); - -void -skc_extent_hr_drw_snap(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - size_t const size); - -void -skc_extent_hr_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - 
void const * const pattern, - size_t const pattern_size, - size_t const size); - -// -// -// - -struct skc_extent_atomic * -skc_extent_atomic_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_atomic_free(struct skc_allocator * const allocator, - struct skc_extent_atomic * const extent); - -void -skc_extent_atomic_snap(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent); - -void -skc_extent_atomic_zero(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent); - -// -// -// - - -struct skc_extent_dxrw * -skc_extent_dxrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count, - struct skc_extent_atomic * const atomic, - size_t const atomic_offset); - -void -skc_extent_dxrw_free(struct skc_allocator * const allocator, - struct skc_extent_dxrw * const extent); - -// -// -// - -struct skc_extent_hcrw * -skc_extent_hcrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcrw_free(struct skc_allocator * const allocator, - struct skc_extent_hcrw * const extent); - -void -skc_extent_hcrw_reset(struct skc_extent_hcrw * const extent); - -skc_bool -skc_extent_hcrw_is_full(struct skc_extent_hcrw const * const extent); - -// -// -// - -struct skc_extent_hcw1_dr1 * -skc_extent_hcw1_dr1_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcw1_dr1_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_reset(struct skc_extent_hcw1_dr1 * const extent); - -skc_bool -skc_extent_hcw1_dr1_is_full(struct 
skc_extent_hcw1_dr1 const * const extent); - -skc_uint -skc_extent_hcw1_dr1_rem(struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_append(struct skc_extent_hcw1_dr1 * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped); - -// -// Note: on a shared memory device this reuses the hcw1_dr1 -// implementation and unmaps the extent instead of copying -// - -struct skc_extent_hcw1_drN_unified * -skc_extent_hcw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_reset(struct skc_extent_hcw1_drN_unified * const extent); - -skc_bool -skc_extent_hcw1_drN_unified_is_full(struct skc_extent_hcw1_drN_unified const * const extent); - -skc_uint -skc_extent_hcw1_drN_unified_rem(struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_append(struct skc_extent_hcw1_drN_unified * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped); -// -// -// - -struct skc_id_pool_hp * -skc_id_pool_hp_alloc(struct skc_allocator * const allocator, - skc_uint const count); - -void -skc_id_pool_hp_free(struct skc_allocator * const allocator, - struct skc_id_pool_hp * const extent); - -void -skc_id_pool_hp_acquire(struct skc_id_pool_hp * const extent, - skc_uint * const id); - -void -skc_id_pool_hp_release_1(struct skc_id_pool_hp * const extent, - skc_uint const id); - -void -skc_id_pool_hp_release_n(struct skc_id_pool_hp * const extent, - skc_uint const * const id, - 
skc_uint const count); - -// -// -// - -struct skc_block_pool_dprw * -skc_block_pool_dprw_alloc(struct skc_allocator * const allocator, - union skc_ring * const ring_d, - skc_uint const block_size, - skc_uint const block_count); - -void -skc_block_pool_dprw_free(struct skc_allocator * const allocator, - struct skc_block_pool_dprw * const extent); - -// -// -// - -struct skc_extent_hgw1_drN_unified * -skc_extent_hgw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hgw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hgw1_drN_unified * const extent); - -void -skc_extent_hgw1_drN_unified_reset(struct skc_extent_hgw1_drN_unified * const extent); - -void -skc_extent_hgw1_drN_unified_snap(struct skc_command_queue * const cq, - struct skc_extent_hgw1_drN_unified const * const extent); - -// -// -// - -#if 0 - -// -// -// - -struct skc_block_pool_hp_drw * -skc_block_pool_hp_drw_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_block_pool_hp_drw_free(struct skc_allocator * const allocator, - struct skc_block_pool_hp_drw * const extent); - -// -// -// - -#endif - -// -// -// - diff --git a/src/compute/skc/cl_20/ring_cl_svm_fine.cpp b/src/compute/skc/cl_20/ring_cl_svm_fine.cpp deleted file mode 100644 index 9552c81f2d..0000000000 --- a/src/compute/skc/cl_20/ring_cl_svm_fine.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// Fine-grained shared virtual memory ring -// -// There is limited support for C11 atomics in C compilers so -// implement this module in C++11 -// - -extern "C" { - -#include "runtime.h" -#include "ring_cl_svm_fine.h" - -} - -// -// -// - -#include <atomic> - -// -// -// - -union skc_ring -{ - std::atomic<skc_uint> rw[2]; - - struct { - std::atomic<skc_uint> reads; // number of reads - std::atomic<skc_uint> writes; // number of writes - }; -}; - -// -// -// - -union skc_ring * -skc_ring_cl_svm_fine_alloc(struct skc_runtime_impl * const runtime_impl) -{ - return (union skc_ring *) - clSVMAlloc(runtime_impl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, - sizeof(union skc_ring), - 0); -} - -void -skc_ring_cl_svm_fine_init(union skc_ring * const ring, skc_uint writes) -{ - ring->reads = ATOMIC_VAR_INIT(0); - ring->writes = ATOMIC_VAR_INIT(writes); -} - -void -skc_ring_cl_svm_fine_free(struct skc_runtime_impl * const runtime_impl, union skc_ring * const ring) -{ - clSVMFree(runtime_impl->context,ring); -} - -// -// -// - -skc_uint -skc_ring_cl_svm_fine_read(union skc_ring * const ring, skc_uint const n) -{ - return atomic_fetch_add_explicit(&ring->reads,n,std::memory_order_relaxed); -} - -skc_uint -skc_ring_cl_svm_fine_write(union skc_ring * const ring, skc_uint const n) -{ - return atomic_fetch_add_explicit(&ring->writes,n,std::memory_order_relaxed); -} - -// -// -// - diff --git a/src/compute/skc/cl_20/ring_cl_svm_fine.h b/src/compute/skc/cl_20/ring_cl_svm_fine.h deleted file mode 100644 index 65ff9f71f3..0000000000 --- a/src/compute/skc/cl_20/ring_cl_svm_fine.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// Fine-grained shared virtual memory ring -// - -#include "runtime.h" -#include "types.h" - -// -// -// - -union skc_ring * -skc_ring_cl_svm_fine_alloc(struct skc_runtime_impl * const runtime_impl); - -void -skc_ring_cl_svm_fine_free(struct skc_runtime_impl * const runtime_impl, union skc_ring * const ring); - -// -// -// - -void -skc_ring_cl_svm_fine_init(union skc_ring * const ring, skc_uint writes); - -// -// -// - -skc_uint -skc_ring_cl_svm_fine_read(union skc_ring * const ring, skc_uint const n); - -skc_uint -skc_ring_cl_svm_fine_write(union skc_ring * const ring, skc_uint const n); - -// -// -// - diff --git a/src/compute/skc/common.h b/src/compute/skc/common.h index 618ba2242e..5ac42ab2dc 100644 --- a/src/compute/skc/common.h +++ b/src/compute/skc/common.h @@ -9,6 +9,8 @@ #ifndef SKC_COMMON_ONCE #define SKC_COMMON_ONCE +#include "types.h" + // // structures common to both host and device -- placeholder until // everything shakes out diff --git a/src/compute/skc/main.c b/src/compute/skc/main.c index 8833b0bb1c..8261f4bdf8 100644 --- a/src/compute/skc/main.c +++ b/src/compute/skc/main.c @@ -30,7 +30,7 @@ // #include <CL/opencl.h> -#include "interop.h" +#include "platforms/cl_12/gl/interop.h" // // @@ -49,7 +49,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context); // // -static +static void is_render_complete(skc_surface_t surface, skc_styling_t styling, @@ -67,9 +67,9 @@ int main(int argc, char** argv) { // + // // - // - if (argc <= 1) + if (argc <= 1) { fprintf(stderr,"-- missing filename\n"); return EXIT_FAILURE; // no filename @@ -110,7 +110,7 @@ main(int argc, char** argv) CL_WGL_HDC_KHR, (cl_context_properties)hDC, 0 }; - + // // create context // @@ -136,14 +136,14 @@ main(int argc, char** argv) skc_raster_builder_t raster_builder; err = skc_raster_builder_create(context,&raster_builder); - + // // create a composition // skc_composition_t composition; err = skc_composition_create(context,&composition); - + // // create a 
styling instance // @@ -154,7 +154,7 @@ main(int argc, char** argv) svg_doc_layer_count(svg_doc), 1000, 2 * 1024 * 1024); - + // // create a surface // @@ -191,7 +191,7 @@ main(int argc, char** argv) skc_transform_stack_restore(ts,ts_save); // decode layers -- places rasters - svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/); + svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/); // seal the composition skc_composition_seal(composition); @@ -244,7 +244,7 @@ main(int argc, char** argv) // unseal the composition skc_composition_unseal(composition,true); } - + // // dispose of mundane resources // diff --git a/src/compute/skc/make_all.bat b/src/compute/skc/make_all.bat deleted file mode 100644 index 4772cc73b4..0000000000 --- a/src/compute/skc/make_all.bat +++ /dev/null @@ -1,15 +0,0 @@ -@ECHO OFF
-
-CMD /C make_inl_cl.bat block_pool_init.cl
-CMD /C make_inl_cl.bat fills_expand.cl
-CMD /C make_inl_cl.bat paths_copy.cl
-CMD /C make_inl_cl.bat rasterize.cl
-CMD /C make_inl_cl.bat segment_ttrk.cl
-CMD /C make_inl_cl.bat rasters_alloc.cl
-CMD /C make_inl_cl.bat prefix.cl
-CMD /C make_inl_cl.bat place.cl
-CMD /C make_inl_cl.bat segment_ttck.cl
-CMD /C make_inl_cl.bat render.cl
-CMD /C make_inl_cl.bat paths_reclaim.cl
-CMD /C make_inl_cl.bat rasters_reclaim.cl
-
diff --git a/src/compute/skc/allocator_device_cl.c b/src/compute/skc/platforms/cl_12/allocator_device_cl.c index aa44f36e87..aa44f36e87 100644 --- a/src/compute/skc/allocator_device_cl.c +++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.c diff --git a/src/compute/skc/allocator_device_cl.h b/src/compute/skc/platforms/cl_12/allocator_device_cl.h index 67d4e41398..67d4e41398 100644 --- a/src/compute/skc/allocator_device_cl.h +++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.h diff --git a/src/compute/skc/atomic_cl.h b/src/compute/skc/platforms/cl_12/atomic_cl.h index c196c36390..c196c36390 100644 --- a/src/compute/skc/atomic_cl.h +++ b/src/compute/skc/platforms/cl_12/atomic_cl.h diff --git a/src/compute/skc/block_pool_cl.h b/src/compute/skc/platforms/cl_12/block_pool_cl.h index c88370919e..c88370919e 100644 --- a/src/compute/skc/block_pool_cl.h +++ b/src/compute/skc/platforms/cl_12/block_pool_cl.h diff --git a/src/compute/skc/block_pool_cl_12.h b/src/compute/skc/platforms/cl_12/block_pool_cl_12.h index 6fa8a39ca0..6fa8a39ca0 100644 --- a/src/compute/skc/block_pool_cl_12.h +++ b/src/compute/skc/platforms/cl_12/block_pool_cl_12.h diff --git a/src/compute/skc/composition_cl_12.c b/src/compute/skc/platforms/cl_12/composition_cl_12.c index 7853564636..7853564636 100644 --- a/src/compute/skc/composition_cl_12.c +++ b/src/compute/skc/platforms/cl_12/composition_cl_12.c diff --git a/src/compute/skc/composition_cl_12.h b/src/compute/skc/platforms/cl_12/composition_cl_12.h index 4f52090658..4f52090658 100644 --- a/src/compute/skc/composition_cl_12.h +++ b/src/compute/skc/platforms/cl_12/composition_cl_12.h diff --git a/src/compute/skc/config_cl.h b/src/compute/skc/platforms/cl_12/config_cl.h index 0172857b07..0172857b07 100644 --- a/src/compute/skc/config_cl.h +++ b/src/compute/skc/platforms/cl_12/config_cl.h diff --git a/src/compute/skc/cq_pool_cl.c b/src/compute/skc/platforms/cl_12/cq_pool_cl.c index 80cfe34cf8..80cfe34cf8 100644 --- 
a/src/compute/skc/cq_pool_cl.c +++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.c diff --git a/src/compute/skc/cq_pool_cl.h b/src/compute/skc/platforms/cl_12/cq_pool_cl.h index 0cc73a2f82..0cc73a2f82 100644 --- a/src/compute/skc/cq_pool_cl.h +++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.h diff --git a/src/compute/skc/device_cl_12.h b/src/compute/skc/platforms/cl_12/device_cl_12.h index 637b61ae10..637b61ae10 100644 --- a/src/compute/skc/device_cl_12.h +++ b/src/compute/skc/platforms/cl_12/device_cl_12.h diff --git a/src/compute/skc/export_cl_12.h b/src/compute/skc/platforms/cl_12/export_cl_12.h index e577282791..e577282791 100644 --- a/src/compute/skc/export_cl_12.h +++ b/src/compute/skc/platforms/cl_12/export_cl_12.h diff --git a/src/compute/skc/extent_cl_12.c b/src/compute/skc/platforms/cl_12/extent_cl_12.c index 73676d8063..73676d8063 100644 --- a/src/compute/skc/extent_cl_12.c +++ b/src/compute/skc/platforms/cl_12/extent_cl_12.c diff --git a/src/compute/skc/extent_cl_12.h b/src/compute/skc/platforms/cl_12/extent_cl_12.h index 47ba951bb3..47ba951bb3 100644 --- a/src/compute/skc/extent_cl_12.h +++ b/src/compute/skc/platforms/cl_12/extent_cl_12.h diff --git a/src/compute/skc/extent_cl_12_unified.c b/src/compute/skc/platforms/cl_12/extent_cl_12_unified.c index 69c669ad54..69c669ad54 100644 --- a/src/compute/skc/extent_cl_12_unified.c +++ b/src/compute/skc/platforms/cl_12/extent_cl_12_unified.c diff --git a/src/compute/skc/interop.c b/src/compute/skc/platforms/cl_12/gl/interop.c index 6697bb7e83..6697bb7e83 100644 --- a/src/compute/skc/interop.c +++ b/src/compute/skc/platforms/cl_12/gl/interop.c diff --git a/src/compute/skc/interop.h b/src/compute/skc/platforms/cl_12/gl/interop.h index 112d365764..112d365764 100644 --- a/src/compute/skc/interop.h +++ b/src/compute/skc/platforms/cl_12/gl/interop.h diff --git a/src/compute/skc/handle_pool_cl_12.c b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.c index 65288c3656..65288c3656 100644 --- 
a/src/compute/skc/handle_pool_cl_12.c +++ b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.c diff --git a/src/compute/skc/handle_pool_cl_12.h b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.h index 4fefae3552..4fefae3552 100644 --- a/src/compute/skc/handle_pool_cl_12.h +++ b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.h diff --git a/src/compute/skc/block_pool_init.cl b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl index 023dff44cf..726b0a7907 100644 --- a/src/compute/skc/block_pool_init.cl +++ b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl @@ -1,64 +1,64 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" - -// -// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ -// - -__kernel -SKC_BP_INIT_IDS_KERNEL_ATTRIBS -void -skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size) -{ - uint const gid = get_global_id(0); - - // - // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to - // accomplish this with fewer threads and using either IPC and/or - // vector stores -- it should be on certain architectures! - // - - // - // initialize pool with sequence - // - if (gid < bp_size) - ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK; -} - -// -// -// - -__kernel -SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS -void -skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size) -{ - // the version test is to squelch a bug with the Intel OpenCL CPU - // compiler declaring it supports the cl_intel_subgroups extension -#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups) - uint const tid = get_sub_group_local_id(); -#else - uint const tid = get_local_id(0); -#endif - - // - // launch two threads and store [ 0, bp_size ] - // - bp_atomics[tid] = tid * bp_size; -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "device_cl_12.h"
+
+//
+// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
+//
+
+__kernel
+SKC_BP_INIT_IDS_KERNEL_ATTRIBS
+void
+skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
+{
+ uint const gid = get_global_id(0);
+
+ //
+ // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
+ // accomplish this with fewer threads and using either IPC and/or
+ // vector stores -- it should be on certain architectures!
+ //
+
+ //
+ // initialize pool with sequence
+ //
+ if (gid < bp_size)
+ ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
+}
+
+//
+//
+//
+
+__kernel
+SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
+void
+skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
+{
+ // the version test is to squelch a bug with the Intel OpenCL CPU
+ // compiler declaring it supports the cl_intel_subgroups extension
+#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
+ uint const tid = get_sub_group_local_id();
+#else
+ uint const tid = get_local_id(0);
+#endif
+
+ //
+ // launch two threads and store [ 0, bp_size ]
+ //
+ bp_atomics[tid] = tid * bp_size;
+}
+
+//
+//
+//
diff --git a/src/compute/skc/device_cl_12_avx2.h b/src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h index e68579c0f7..e68579c0f7 100644 --- a/src/compute/skc/device_cl_12_avx2.h +++ b/src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h diff --git a/src/compute/skc/device_cl_12_gen9.c b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c index 5b4d9d2dd2..aebe8fdc1d 100644 --- a/src/compute/skc/device_cl_12_gen9.c +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c @@ -11,13 +11,15 @@ #include <assert.h> #include "common/cl/assert_cl.h" + +#include "tile.h" +#include "raster.h" #include "macros.h" #include "config_cl.h" #include "runtime_cl_12.h" -#include "raster.h" -#include "tile.h" +#include "device_cl_12.h" #include "hs/cl/hs_cl_launcher.h" #include "hs/cl/gen9/hs_cl.h" @@ -36,58 +38,52 @@ #if SKC_KERNEL_SPIRV -#include "block_pool_init.pre.spv.inl" -#include "paths_copy.pre.spv.inl" -#include "fills_expand.pre.spv.inl" -#include "rasterize.pre.spv.inl" -#include "segment_ttrk.pre.spv.inl" -#include "rasters_alloc.pre.spv.inl" -#include "prefix.pre.spv.inl" -#include "place.pre.spv.inl" -#include "segment_ttck.pre.spv.inl" -#include "render.pre.spv.inl" -#include "paths_reclaim.pre.spv.inl" -#include "rasters_reclaim.pre.spv.inl" +#include "inl/block_pool_init.pre.spv.inl" +#include "inl/paths_copy.pre.spv.inl" +#include "inl/fills_expand.pre.spv.inl" +#include "inl/rasterize.pre.spv.inl" +#include "inl/segment_ttrk.pre.spv.inl" +#include "inl/rasters_alloc.pre.spv.inl" +#include "inl/prefix.pre.spv.inl" +#include "inl/place.pre.spv.inl" +#include "inl/segment_ttck.pre.spv.inl" +#include "inl/render.pre.spv.inl" +#include "inl/paths_reclaim.pre.spv.inl" +#include "inl/rasters_reclaim.pre.spv.inl" #elif SKC_KERNEL_BINARY -#include "block_pool_init.pre.bin.inl" -#include "paths_copy.pre.bin.inl" -#include "fills_expand.pre.bin.inl" -#include "rasterize.pre.bin.inl" -#include 
"segment_ttrk.pre.bin.inl" -#include "rasters_alloc.pre.bin.inl" -#include "prefix.pre.bin.inl" -#include "place.pre.bin.inl" -#include "segment_ttck.pre.bin.inl" -#include "render.pre.bin.inl" -#include "paths_reclaim.pre.bin.inl" -#include "rasters_reclaim.pre.bin.inl" +#include "inl/block_pool_init.pre.bin.inl" +#include "inl/paths_copy.pre.bin.inl" +#include "inl/fills_expand.pre.bin.inl" +#include "inl/rasterize.pre.bin.inl" +#include "inl/segment_ttrk.pre.bin.inl" +#include "inl/rasters_alloc.pre.bin.inl" +#include "inl/prefix.pre.bin.inl" +#include "inl/place.pre.bin.inl" +#include "inl/segment_ttck.pre.bin.inl" +#include "inl/render.pre.bin.inl" +#include "inl/paths_reclaim.pre.bin.inl" +#include "inl/rasters_reclaim.pre.bin.inl" #elif SKC_KERNEL_SRC -#include "block_pool_init.pre.src.inl" -#include "paths_copy.pre.src.inl" -#include "fills_expand.pre.src.inl" -#include "rasterize.pre.src.inl" -#include "segment_ttrk.pre.src.inl" -#include "rasters_alloc.pre.src.inl" -#include "prefix.pre.src.inl" -#include "place.pre.src.inl" -#include "segment_ttck.pre.src.inl" -#include "render.pre.src.inl" -#include "paths_reclaim.pre.src.inl" -#include "rasters_reclaim.pre.src.inl" +#include "inl/block_pool_init.pre.src.inl" +#include "inl/paths_copy.pre.src.inl" +#include "inl/fills_expand.pre.src.inl" +#include "inl/rasterize.pre.src.inl" +#include "inl/segment_ttrk.pre.src.inl" +#include "inl/rasters_alloc.pre.src.inl" +#include "inl/prefix.pre.src.inl" +#include "inl/place.pre.src.inl" +#include "inl/segment_ttck.pre.src.inl" +#include "inl/render.pre.src.inl" +#include "inl/paths_reclaim.pre.src.inl" +#include "inl/rasters_reclaim.pre.src.inl" #endif // -// -// - -#include "device_cl_12_gen9.h" - -// // FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY // diff --git a/src/compute/skc/device_cl_12_gen9.h b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h index dd69a845c2..0cac2261e7 100644 --- a/src/compute/skc/device_cl_12_gen9.h +++ 
b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h @@ -6,8 +6,8 @@ * */ -#ifndef SKC_ONCE_DEVICE_CL_12_GEN9_H -#define SKC_ONCE_DEVICE_CL_12_GEN9_H +#ifndef SKC_ONCE_DEVICE_CL_12_H +#define SKC_ONCE_DEVICE_CL_12_H // // FIXME -- THERE ARE SOME DUPLICATED TYPEDEFS IN THIS FILE @@ -18,6 +18,12 @@ #include "block.h" // +// +// + +#include <hs/cl/gen9/hs_cl_macros.h> + +// // HOW TO SELECT A SUBBLOCK AND BLOCK SIZES: // // 1) The subblock size should match the natural SIMT/SIMD width of diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat new file mode 100644 index 0000000000..3631271d9b --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat @@ -0,0 +1,15 @@ +@ECHO OFF
+
+CMD /C make_inl_cl.bat ..\..\..\block_pool_init.cl
+CMD /C make_inl_cl.bat ..\..\..\fills_expand.cl
+CMD /C make_inl_cl.bat ..\..\..\paths_copy.cl
+CMD /C make_inl_cl.bat ..\..\..\rasterize.cl
+CMD /C make_inl_cl.bat ..\..\..\segment_ttrk.cl
+CMD /C make_inl_cl.bat ..\..\..\rasters_alloc.cl
+CMD /C make_inl_cl.bat ..\..\..\prefix.cl
+CMD /C make_inl_cl.bat ..\..\..\place.cl
+CMD /C make_inl_cl.bat ..\..\..\segment_ttck.cl
+CMD /C make_inl_cl.bat ..\..\..\render.cl
+CMD /C make_inl_cl.bat ..\..\..\paths_reclaim.cl
+CMD /C make_inl_cl.bat ..\..\..\rasters_reclaim.cl
+
diff --git a/src/compute/skc/make_inl_cl.bat b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat index 777a5f3bc2..e3b0b37651 100644 --- a/src/compute/skc/make_inl_cl.bat +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat @@ -1,5 +1,9 @@ @ECHO OFF
+::
+:: TARGET OPENCL 1.2
+::
+
SET OPENCL_STD=-cl-std=CL1.2
SET OPENCL_PRE=__OPENCL_C_VERSION__=120
@@ -26,9 +30,8 @@ SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% ::
::
-SET PRE_DIR=%~p1
-
-CD %PRE_DIR%
+REM SET PRE_DIR=%~p1
+REM CD %PRE_DIR%
SET PRE_CL=%~n1
SET PRE_CL=%PRE_CL%.pre.cl
@@ -43,11 +46,21 @@ SET PRE_BIN_INL=%~n1 SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl
::
+::
+::
+
+SET DIR_CL12="%INTELOCLSDKROOT%include"
+SET DIR_COMPUTE=..\..\..\..\..\..\..
+SET DIR_SKC=%DIR_COMPUTE%\skc
+SET DIR_PLATFORM=%DIR_SKC%\platforms\cl_12
+SET DIR_DEVICE=..
+
+::
:: *.pre.cl
:: *.pre.src.inl
::
-CMD /C cl -I . -I .. -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%"
+CMD /C cl -I %DIR_CL12% -I %DIR_DEVICE% -I %DIR_PLATFORM% -I %DIR_SKC% -I %DIR_COMPUTE% -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%"
CMD /C clang-format -style=Mozilla -i %PRE_CL%
CMD /C dos2unix -q %PRE_CL%
CMD /C xxd -i %PRE_CL% %PRE_SRC_INL%
diff --git a/src/compute/skc/fills_expand.cl b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl index b6f56794c5..39fee75f3d 100644 --- a/src/compute/skc/fills_expand.cl +++ b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl @@ -1,309 +1,309 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "atomic_cl.h" -#include "block.h" -#include "path.h" -#include "common.h" - -// -// -// - -#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) - -#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) -#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) - -#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// -// - -#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// -// - -#if ( SKC_FILLS_EXPAND_X == 1 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_FILLS_EXPAND_X == 2 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_FILLS_EXPAND_X == 4 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_FILLS_EXPAND_X == 8 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_FILLS_EXPAND_X == 16) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_FILLS_EXPAND_X" -#endif - -// -// Fill and rasterize cmds only differ in their first word semantics -// - 
-union skc_cmd_expand -{ - union skc_cmd_fill fill; - union skc_cmd_rasterize rasterize; -}; - -// -// -// - -union skc_path_elem -{ - skc_uint u32; - skc_float f32; -}; - -// -// COMPILE-TIME AND RUN-TIME MACROS -// - -#define SKC_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \ - (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -#define SKC_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - SKC_BROADCAST_LAST_HELPER(E,I) - -// -// -// - -void -skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out, - skc_uint * const out_idx, - union skc_cmd_expand * const cmd, - union skc_path_elem const e, - skc_uint const e_idx) -{ - // - // FIXME -- we can append a large number of nodeword indices to a - // local SMEM queue and flush when full. It may or may not be a - // performance win on some architectures. - // - skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT; - skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0); - - cmd->rasterize.nodeword = e_idx; - - if (is_elem) { - cmds_out[*out_idx + offset] = cmd->rasterize; - } - - *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1); -} - -// -// -// - -__kernel -SKC_FILLS_EXPAND_KERNEL_ATTRIBS -void -skc_kernel_fills_expand(__global union skc_path_elem const * const blocks, - __global skc_uint volatile * const atomics, - __global skc_block_id_t const * const map, - __global union skc_cmd_fill const * const cmds_in, - __global union skc_cmd_rasterize * const cmds_out) -{ - // - // Need to harmonize the way we determine a subgroup's id. 
In this - // kernel it's not as important because no local memory is being - // used. Although the device/mask calc to determine subgroup and - // lanes is still proper, we might want to make it clearer that - // we're working with subgroups by using the subgroup API. - // - // every subgroup/simd that will work on the block loads the same command - // -#if (__OPENCL_VERSION__ < 200) - skc_uint const cmd_stride = get_num_sub_groups(); -#else - skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id(); - - // load fill command -- we reuse y component - union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] }; - - // get the path header block from the map - skc_block_id_t id = map[cmd.fill.path]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("expand[%u] = %u\n",cmd_idx,id); -#endif - - // - // blindly load all of the head elements into registers - // - skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - skc_uint count_nodes, count_prims; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ - count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \ - } \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \ - count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \ - } - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // debug of path head - // -#if 0 - skc_uint count_blocks; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ - count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ - } - - 
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - if (get_sub_group_local_id() == 0) - printf("path header = { %5u, %5u, %5u }\n", - count_blocks,count_nodes,count_prims); -#endif - - // - // acquire slots in the expanded cmd extent - // - // decrement prim_idx by 1 so we can use inclusive warp scan later - // - skc_uint out_idx = 0; - - if (get_sub_group_local_id() == 0) { - out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP - (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1; - } - - out_idx = sub_group_broadcast(out_idx,0); - - // - // process ids trailing the path header - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ - if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \ - h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \ - } \ - } \ - skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \ - head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \ - } - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, process the nodes - // - - // - // get id of next node - // - id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); - - // - // the following blocks are nodes - // - while (true) - { - // get index of each element - skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); - - // - // blindly load all of the node elements into registers - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // append all valid ids - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \ - node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); - - 
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // any more nodes? - if (--count_nodes == 0) - return; - - // - // get id of next node - // - id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); - } -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "block.h"
+#include "path.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "raster_builder_cl_12.h"
+#include "device_cl_12.h"
+
+//
+//
+//
+
+#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
+
+#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
+#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
+
+#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+//
+//
+//
+
+#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+//
+//
+//
+
+#if ( SKC_FILLS_EXPAND_X == 1 )
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0
+
+#elif ( SKC_FILLS_EXPAND_X == 2 )
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1
+
+#elif ( SKC_FILLS_EXPAND_X == 4 )
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3
+
+#elif ( SKC_FILLS_EXPAND_X == 8 )
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7
+
+#elif ( SKC_FILLS_EXPAND_X == 16)
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15
+
+#else
+#error "MISSING SKC_FILLS_EXPAND_X"
+#endif
+
+//
+// Fill and rasterize cmds only differ in their first word semantics
+//
+
+union skc_cmd_expand
+{
+ union skc_cmd_fill fill;
+ union skc_cmd_rasterize rasterize;
+};
+
+//
+//
+//
+
+union skc_path_elem
+{
+ skc_uint u32;
+ skc_float f32;
+};
+
+//
+// COMPILE-TIME AND RUN-TIME MACROS
+//
+
+#define SKC_ELEM_IN_RANGE(X,I) \
+ (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
+ (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+#define SKC_ELEM_GTE(X,I) \
+ SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+//
+// FIXME -- slate these for replacement
+//
+
+#define SKC_BROADCAST(E,S,I) \
+ sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+#define SKC_BROADCAST_LAST_HELPER(E,I) \
+ sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
+
+#define SKC_BROADCAST_LAST(E,I) \
+ SKC_BROADCAST_LAST_HELPER(E,I)
+
+//
+//
+//
+
+void
+skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
+ skc_uint * const out_idx,
+ union skc_cmd_expand * const cmd,
+ union skc_path_elem const e,
+ skc_uint const e_idx)
+{
+ //
+ // FIXME -- we can append a large number of nodeword indices to a
+ // local SMEM queue and flush when full. It may or may not be a
+ // performance win on some architectures.
+ //
+ skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
+ skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
+
+ cmd->rasterize.nodeword = e_idx;
+
+ if (is_elem) {
+ cmds_out[*out_idx + offset] = cmd->rasterize;
+ }
+
+ *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
+}
+
+//
+//
+//
+
+__kernel
+SKC_FILLS_EXPAND_KERNEL_ATTRIBS
+void
+skc_kernel_fills_expand(__global union skc_path_elem const * const blocks,
+ __global skc_uint volatile * const atomics,
+ __global skc_block_id_t const * const map,
+ __global union skc_cmd_fill const * const cmds_in,
+ __global union skc_cmd_rasterize * const cmds_out)
+{
+ //
+ // Need to harmonize the way we determine a subgroup's id. In this
+ // kernel it's not as important because no local memory is being
+ // used. Although the device/mask calc to determine subgroup and
+ // lanes is still proper, we might want to make it clearer that
+ // we're working with subgroups by using the subgroup API.
+ //
+ // every subgroup/simd that will work on the block loads the same command
+ //
+#if (__OPENCL_VERSION__ < 200)
+ skc_uint const cmd_stride = get_num_sub_groups();
+#else
+ skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
+#endif
+ skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();
+
+ // load fill command -- we reuse y component
+ union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };
+
+ // get the path header block from the map
+ skc_block_id_t id = map[cmd.fill.path];
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("expand[%u] = %u\n",cmd_idx,id);
+#endif
+
+ //
+ // blindly load all of the head elements into registers
+ //
+ skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
+
+ SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+ //
+ // pick out count.nodes and count.prims from the header
+ //
+ skc_uint count_nodes, count_prims;
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
+ count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \
+ } \
+ if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \
+ count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \
+ }
+
+ SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+ //
+ // debug of path head
+ //
+#if 0
+ skc_uint count_blocks;
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
+ count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
+ }
+
+ SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+ if (get_sub_group_local_id() == 0)
+ printf("path header = { %5u, %5u, %5u }\n",
+ count_blocks,count_nodes,count_prims);
+#endif
+
+ //
+ // acquire slots in the expanded cmd extent
+ //
+ // decrement prim_idx by 1 so we can use inclusive warp scan later
+ //
+ skc_uint out_idx = 0;
+
+ if (get_sub_group_local_id() == 0) {
+ out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
+ (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
+ }
+
+ out_idx = sub_group_broadcast(out_idx,0);
+
+ //
+ // process ids trailing the path header
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
+ if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
+ if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
+ h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \
+ } \
+ } \
+ skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \
+ head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
+ }
+
+ SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+ //
+ // we're done if it was just the header
+ //
+ if (count_nodes == 0)
+ return;
+
+ //
+ // otherwise, process the nodes
+ //
+
+ //
+ // get id of next node
+ //
+ id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
+
+ //
+ // the following blocks are nodes
+ //
+ while (true)
+ {
+ // get index of each element
+ skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
+
+ //
+ // blindly load all of the node elements into registers
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
+
+ SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+ //
+ // append all valid ids
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \
+ node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);
+
+ SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+ // any more nodes?
+ if (--count_nodes == 0)
+ return;
+
+ //
+ // get id of next node
+ //
+ id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/paths_copy.cl b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl index 06cc393c75..302ea14af2 100644 --- a/src/compute/skc/paths_copy.cl +++ b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl @@ -1,543 +1,543 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "path_builder_cl_12.h" -#include "path.h" -#include "block_pool_cl.h" - -// -// -// - -#if 0 - -// -// SIMD AVX2 -// - -#define SKC_PATHS_COPY_WORDS_PER_ELEM 8 -#define SKC_PATHS_COPY_SUBGROUP_SIZE 1 -#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES - -typedef skc_uint8 skc_paths_copy_elem; -typedef skc_uint8 skc_pb_idx_v; - -#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() - -#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS) - -#endif - -// -// -// - -#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) -#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) -#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) -#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) - -// FIXME -- use SUBGROUP terminology everywhere -#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) - -// -// -// - -#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ - (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) - -#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ - (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) - -// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) - -// -// -// - -// -// BIT-FIELD EXTRACT/INSERT ARE NOT 
AVAILABLE IN OPENCL -// - -#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) - -#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) - -// -// -// - -skc_uint -skc_sub_group_local_id() -{ -#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 - return get_sub_group_local_id(); -#else - return 0; -#endif -} - -// -// convert an atomic read counter offset to a block id -// - -skc_block_id_t -skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, - skc_uint const bp_idx_mask, - skc_uint const bp_reads, - skc_uint const bp_off) -{ - skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; - - return bp_ids[bp_idx]; -} - -// -// -// - -void -skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to - skc_uint const bp_elems_idx, - __global skc_paths_copy_elem const * const pb_elems, // from - skc_uint const pb_elems_idx) -{ - for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) - { - (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii]; - } - -#if 0 - // - // NOTE THIS IS PRINTING 8 ROWS - // - printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", - (skc_uint)get_global_id(0),pb_elems_idx, - as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]), - as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]), - as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]), - as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE])); - printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", - (skc_uint)get_global_id(0),pb_elems_idx, - 
as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]), - as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]), - as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]), - as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE])); -#endif -} - -// -// -// - -void -skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to - skc_uint const bp_elems_idx, - __global skc_block_id_t const * const bp_ids, - skc_uint const bp_reads, - skc_uint const bp_idx_mask, - __global skc_paths_copy_elem const * const pb_elems, // from - skc_uint const pb_elems_idx, - skc_uint const pb_rolling) -{ - // - // remap block id tags bp_elems the host-side rolling counter pb_elems a - // device-side block pool id - // - for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) - { - // load block_id_tag words - skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii]; - - // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid - skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; - - // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS - - // - // FIXME -- SIMD can be fully parallelized since a bp_ids[] load - // will _always_ be safe as long as we don't use the loaded - // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead - // of iterating over the vector components. 
- // - - // only convert if original elem is not invalid - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ - skc_block_id_t const b = bp_ids[bp_idx C]; \ - elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ - } - - // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C); - - SKC_PATHS_COPY_ELEM_EXPAND(); - - // store the elem back - (bp_elems+bp_elems_idx)[ii] = elem; - } -} - -// -// -// - -void -skc_host_map_update(__global skc_uint * const host_map, - skc_uint const block, - skc_paths_copy_elem const elem) -{ - // - // write first elem to map -- FIXME -- this is a little nasty - // because it relies on the the host handle always being the first - // word in the path header. - // - // OTOH, this is not unreasonable. The alternative is to have a - // separate kernel initializing the map. - // -#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 - if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) -#endif - { -#if SKC_PATHS_COPY_ELEM_WORDS == 1 - host_map[elem] = block; -#if 0 - printf("[%u] = %u\n",elem,block); -#endif -#else - host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; -#endif - } -} - -// -// -// - -void -skc_copy_head(__global skc_uint * const host_map, - skc_uint const block, - __global skc_paths_copy_elem * const bp_elems, // to - skc_uint const bp_elems_idx, - __global skc_block_id_t const * const bp_ids, - skc_uint const bp_reads, - skc_uint const bp_idx_mask, - __global skc_paths_copy_elem const * const pb_elems, // from - skc_uint const pb_elems_idx, - skc_uint const pb_rolling) -{ - // - // if there are more path header words than there are - // threads-per-block then we can just copy the initial header words - // -#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) - for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) - { - skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii]; - - (bp_elems+bp_elems_idx)[ii] = elem; - - if (ii == 
0) { - skc_host_map_update(host_map,block,elem); - } - } -#endif - - // - // this is similar to copy node but the first H words of the path - // header are not modified and simply copied - // - for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) - { - skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; - -#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 ) - if (ii == 0) { - skc_host_map_update(host_map,block,elem); - } -#endif - // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid - skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; - - // - // FIXME -- SIMD can be fully parallelized since a bp_ids[] load - // will _always_ be safe as long as we don't use the loaded - // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead - // of iterating over the vector components. - // - - // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS - - // FIXME -- MIX MIX MIX MIX / SELECT - - // only convert if original elem is not invalid -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \ - skc_block_id_t const b = bp_ids[bp_idx C]; \ - elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ - } - - // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C); - - SKC_PATHS_COPY_ELEM_EXPAND(); - - // store the elem back - (bp_elems+bp_elems_idx)[ii] = elem; - } - - // - // the remaining words are treated like a node - // - for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) - { - // load block_id_tag words - skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; - - // calculate ahead of time - skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; - - // - // FIXME 
-- SIMD can be fully parallelized since a bp_ids[] load - // will _always_ be safe as long as we don't use the loaded - // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead - // of iterating over the vector components. - // - - // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS - - // only convert if original elem is not invalid -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ - skc_block_id_t const b = bp_ids[bp_idx C]; \ - elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ - } - - // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C); - - SKC_PATHS_COPY_ELEM_EXPAND(); - - // store the elem - (bp_elems+bp_elems_idx)[ii] = elem; - } -} - -// -// FIXME -- pack some of these constant integer args in a vec or struct -// - -__kernel -SKC_PATHS_COPY_KERNEL_ATTRIBS -void -skc_kernel_paths_copy -(__global skc_uint * const host_map, - - __global skc_block_id_t const * const bp_ids, - __global skc_paths_copy_elem * const bp_elems, - skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring - - __global skc_uint const * const bp_alloc, // block pool ring base - skc_uint const bp_alloc_idx,// which subbuf - - __global union skc_tagged_block_id const * const pb_cmds, - __global skc_paths_copy_elem const * const pb_elems, - - skc_uint const pb_size, // # of commands/blocks in buffer - skc_uint const pb_rolling, // shifted rolling counter base - - skc_uint const pb_prev_from, - skc_uint const pb_prev_span, - skc_uint const pb_curr_from) -{ - // - // THERE ARE 3 TYPES OF PATH COPYING COMMANDS: - // - // - HEAD - // - NODE - // - SEGS - // - // THESE ARE SUBGROUP ORIENTED KERNELS - // - // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS - // - - // - // It's likely that peak bandwidth is achievable with a single - // workgroup. 
- // - // So let's keep the grids modestly sized and for simplicity and - // portability, let's assume that a single workgroup can perform all - // steps in the copy. - // - // Launch as large of a workgroup as possiblex - // - // 1. ATOMICALLY ALLOCATE BLOCKS BP_ELEMS POOL - // 2. CONVERT COMMANDS IN PB_ELEMS BLOCK OFFSETS - // 3. FOR EACH COMMAND: - // - HEAD: SAVED HEAD ID PB_ELEMS MAP. CONVERT AND COPY H INDICES. - // - NODE: CONVERT AND COPY B INDICES - // - SEGS: BULK COPY - // - // B : number of words in block -- always pow2 - // W : intelligently/arbitrarily chosen factor of B -- always pow2 - // - - // - // There are several approaches to processing the commands: - // - // 1. B threads are responsible for one block. All threads broadcast - // load a single command word. Workgroup size must be a facpb_elemsr of - // B. - // - // 2. W threads process an entire block. W will typically be the - // device's subgroup/warp/wave width. W threads broadcast load a - // single command word. - // - // 3. W threads process W blocks. W threads load W command words and - // process W blocks. - // - // Clearly (1) has low I/O intensity but will achieve high - // parallelism by activating the most possible threads. The downside - // of this kind of approach is that the kernel will occupy even a - // large GPU with low intensity work and reduce opportunities for - // concurrent kernel execution (of other kernels). - // - // See Vasily Volkov's CUDA presentation describing these tradeoffs. - // - // Note that there are many other approaches. For example, similar - // pb_elems (1) but each thread loads a pow2 vector of block data. 
- // - - // load the copied atomic read "base" from gmem - skc_uint const bp_reads = bp_alloc[bp_alloc_idx]; - // will always be less than 2^32 - skc_uint const gid = get_global_id(0); - // every subgroup/simd that will work on the block loads the same command - skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE; - // path builder data can be spread across two spans - skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from); - - // no need pb_elems make this branchless - if (pb_idx >= pb_size) - pb_idx -= pb_size; - - // broadcast load the command - union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; - - // what do we want pb_elems do with this block? - skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); - - // compute offset from rolling base to get index into block pool ring allocation - skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); - - // convert the pb_cmd's offset counter pb_elems a block id - skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); - printf("< %8u >\n",block); - } -#endif - - // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() - skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; - - // calculate bp_elems (to) / pb_elems (from) - skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; - skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; - - if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) - { -#if 0 - if (tid == 0) - printf("%3u, segs\n",bp_off); -#endif - skc_copy_segs(bp_elems, - bp_elems_idx, - pb_elems, - pb_elems_idx); - } - else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) - { -#if 0 - if (tid == 0) - printf("%3u, NODE\n",bp_off); -#endif - skc_copy_node(bp_elems, // to - bp_elems_idx, - bp_ids, - bp_reads, - bp_idx_mask, - pb_elems, // from - 
pb_elems_idx, - pb_rolling); - } - else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) - { -#if 0 - if (tid == 0) - printf("%3u, HEAD\n",bp_off); -#endif - skc_copy_head(host_map, - block, - bp_elems, // to - bp_elems_idx, - bp_ids, - bp_reads, - bp_idx_mask, - pb_elems, // from - pb_elems_idx, - pb_rolling); - } -} - -// -// -// - -__kernel -SKC_PATHS_ALLOC_KERNEL_ATTRIBS -void -skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, - __global skc_uint * const bp_alloc, - skc_uint const bp_alloc_idx, - skc_uint const pb_cmd_count) -{ - // - // allocate blocks in block pool - // - skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); - - // store in slot - bp_alloc[bp_alloc_idx] = reads; - -#if 0 - printf("pc: %8u + %u\n",reads,pb_cmd_count); -#endif -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "path.h"
+#include "block_pool_cl.h"
+#include "path_builder_cl_12.h"
+#include "device_cl_12.h"
+
+//
+//
+//
+
+#if 0
+
+//
+// SIMD AVX2
+//
+
+#define SKC_PATHS_COPY_WORDS_PER_ELEM 8
+#define SKC_PATHS_COPY_SUBGROUP_SIZE 1
+#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
+
+typedef skc_uint8 skc_paths_copy_elem;
+typedef skc_uint8 skc_pb_idx_v;
+
+#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8()
+
+#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS)
+
+#endif
+
+//
+//
+//
+
+// derived sizing -- the subgroup size is pow2, so (size - 1) is a modulo mask
+#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
+// elems per block/subblock: device word counts divided by the words in one elem
+#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
+#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
+// how many elems each subgroup lane handles per block
+#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
+
+// FIXME -- use SUBGROUP terminology everywhere
+#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
+
+//
+//
+//
+
+#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \
+ (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
+
+#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \
+ (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
+
+// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
+
+//
+//
+//
+
+//
+// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
+//
+
+// low bits of a copy command hold the tag plus subblock index; the
+// rolling allocation counter lives in the bits above them
+#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
+
+#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
+
+// extract the rolling counter stored above the tag+subblock bits
+#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
+
+// replace the rolling counter with a resolved block id while keeping the low bits
+#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
+
+//
+//
+//
+
+//
+// Lane index of this work-item within its subgroup.  When the kernel
+// is configured with a subgroup size of 1 (the SIMD build) the OpenCL
+// builtin is bypassed and the lane id is constant 0.
+//
+skc_uint
+skc_sub_group_local_id()
+{
+#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
+ return get_sub_group_local_id();
+#else
+ return 0;
+#endif
+}
+
+//
+// convert an atomic read counter offset to a block id
+//
+
+skc_block_id_t
+skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
+ skc_uint const bp_idx_mask,
+ skc_uint const bp_reads,
+ skc_uint const bp_off)
+{
+ // the id ring is pow2-sized, so wrap (reads base + offset) with a mask
+ skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
+
+ // return the device block id stored at that ring slot
+ return bp_ids[bp_idx];
+}
+
+//
+//
+//
+
+//
+// SEGS: bulk-copy one block of path segment data from the path
+// builder extent (pb_elems) into the device block pool (bp_elems).
+// Segment words carry no embedded block ids, so no remapping is
+// needed -- each subgroup lane simply strides across the block.
+//
+void
+skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to
+ skc_uint const bp_elems_idx,
+ __global skc_paths_copy_elem const * const pb_elems, // from
+ skc_uint const pb_elems_idx)
+{
+ for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
+ }
+
+#if 0
+ //
+ // NOTE THIS IS PRINTING 8 ROWS
+ //
+ printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
+ (skc_uint)get_global_id(0),pb_elems_idx,
+ as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
+ printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
+ (skc_uint)get_global_id(0),pb_elems_idx,
+ as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
+#endif
+}
+
+//
+//
+//
+
+//
+// NODE: copy a node block into the block pool while rewriting each
+// valid tagged block id so it refers to a device-side block pool id
+// instead of the host-side rolling allocation counter.
+//
+void
+skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to
+ skc_uint const bp_elems_idx,
+ __global skc_block_id_t const * const bp_ids,
+ skc_uint const bp_reads,
+ skc_uint const bp_idx_mask,
+ __global skc_paths_copy_elem const * const pb_elems, // from
+ skc_uint const pb_elems_idx,
+ skc_uint const pb_rolling)
+{
+ //
+ // remap block id tags from the host-side rolling counter to a
+ // device-side block pool id
+ //
+ for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ // load block_id_tag words
+ skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];
+
+ // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
+ skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+ // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+ //
+ // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+ // will _always_ be safe as long as we don't use the loaded
+ // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
+ // of iterating over the vector components.
+ //
+
+ // only convert if original elem is not invalid
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
+ skc_block_id_t const b = bp_ids[bp_idx C]; \
+ elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
+ }
+
+ // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
+
+ SKC_PATHS_COPY_ELEM_EXPAND();
+
+ // store the elem back
+ (bp_elems+bp_elems_idx)[ii] = elem;
+ }
+}
+
+//
+//
+//
+
+//
+// Record a path head's device block id in the host-to-device map,
+// keyed by the host handle carried in the head's first word.
+//
+void
+skc_host_map_update(__global skc_uint * const host_map,
+ skc_uint const block,
+ skc_paths_copy_elem const elem)
+{
+ //
+ // write first elem to map -- FIXME -- this is a little nasty
+ // because it relies on the host handle always being the first
+ // word in the path header.
+ //
+ // OTOH, this is not unreasonable. The alternative is to have a
+ // separate kernel initializing the map.
+ //
+#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
+ if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
+#endif
+ {
+#if SKC_PATHS_COPY_ELEM_WORDS == 1
+ host_map[elem] = block;
+#if 0
+ printf("[%u] = %u\n",elem,block);
+#endif
+#else
+ // vector elem: pick out the handle component by its lane offset
+ host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
+#endif
+ }
+}
+
+//
+//
+//
+
+//
+// HEAD: copy a path head block.  The first SKC_PATH_HEAD_WORDS words
+// are copied unmodified (recording the host handle in the
+// host-to-device map); the remaining tagged block id words are
+// remapped exactly like a node block.
+//
+void
+skc_copy_head(__global skc_uint * const host_map,
+ skc_uint const block,
+ __global skc_paths_copy_elem * const bp_elems, // to
+ skc_uint const bp_elems_idx,
+ __global skc_block_id_t const * const bp_ids,
+ skc_uint const bp_reads,
+ skc_uint const bp_idx_mask,
+ __global skc_paths_copy_elem const * const pb_elems, // from
+ skc_uint const pb_elems_idx,
+ skc_uint const pb_rolling)
+{
+ //
+ // if there are more path header words than there are
+ // threads-per-block then we can just copy the initial header words
+ //
+#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
+ for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
+
+ (bp_elems+bp_elems_idx)[ii] = elem;
+
+ if (ii == 0) {
+ skc_host_map_update(host_map,block,elem);
+ }
+ }
+#endif
+
+ //
+ // this is similar to copy node but the first H words of the path
+ // header are not modified and simply copied
+ //
+ for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
+
+#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
+ if (ii == 0) {
+ skc_host_map_update(host_map,block,elem);
+ }
+#endif
+ // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
+ skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+ //
+ // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+ // will _always_ be safe as long as we don't use the loaded
+ // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
+ // of iterating over the vector components.
+ //
+
+ // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+ // FIXME -- MIX MIX MIX MIX / SELECT
+
+ // only convert if original elem is not invalid
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
+ skc_block_id_t const b = bp_ids[bp_idx C]; \
+ elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
+ }
+
+ // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
+
+ SKC_PATHS_COPY_ELEM_EXPAND();
+
+ // store the elem back
+ (bp_elems+bp_elems_idx)[ii] = elem;
+ }
+
+ //
+ // the remaining words are treated like a node
+ //
+ for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ // load block_id_tag words
+ skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
+
+ // calculate ahead of time
+ skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+ //
+ // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+ // will _always_ be safe as long as we don't use the loaded
+ // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
+ // of iterating over the vector components.
+ //
+
+ // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+ // only convert if original elem is not invalid
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
+ skc_block_id_t const b = bp_ids[bp_idx C]; \
+ elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
+ }
+
+ // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
+
+ SKC_PATHS_COPY_ELEM_EXPAND();
+
+ // store the elem
+ (bp_elems+bp_elems_idx)[ii] = elem;
+ }
+}
+
+//
+// FIXME -- pack some of these constant integer args in a vec or struct
+//
+
+__kernel
+SKC_PATHS_COPY_KERNEL_ATTRIBS
+void
+skc_kernel_paths_copy
+(__global skc_uint * const host_map,
+
+ __global skc_block_id_t const * const bp_ids,
+ __global skc_paths_copy_elem * const bp_elems,
+ skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring
+
+ __global skc_uint const * const bp_alloc, // block pool ring base
+ skc_uint const bp_alloc_idx,// which subbuf
+
+ __global union skc_tagged_block_id const * const pb_cmds,
+ __global skc_paths_copy_elem const * const pb_elems,
+
+ skc_uint const pb_size, // # of commands/blocks in buffer
+ skc_uint const pb_rolling, // shifted rolling counter base
+
+ skc_uint const pb_prev_from,
+ skc_uint const pb_prev_span,
+ skc_uint const pb_curr_from)
+{
+ //
+ // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
+ //
+ // - HEAD
+ // - NODE
+ // - SEGS
+ //
+ // THESE ARE SUBGROUP ORIENTED KERNELS
+ //
+ // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
+ //
+
+ //
+ // It's likely that peak bandwidth is achievable with a single
+ // workgroup.
+ //
+ // So let's keep the grids modestly sized and for simplicity and
+ // portability, let's assume that a single workgroup can perform all
+ // steps in the copy.
+ //
+ // Launch as large of a workgroup as possible
+ //
+ // 1. ATOMICALLY ALLOCATE BLOCKS FROM POOL
+ // 2. CONVERT COMMANDS INTO BLOCK OFFSETS
+ // 3. FOR EACH COMMAND:
+ // - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
+ // - NODE: CONVERT AND COPY B INDICES
+ // - SEGS: BULK COPY
+ //
+ // B : number of words in block -- always pow2
+ // W : intelligently/arbitrarily chosen factor of B -- always pow2
+ //
+
+ //
+ // There are several approaches to processing the commands:
+ //
+ // 1. B threads are responsible for one block. All threads broadcast
+ // load a single command word. Workgroup size must be a factor of
+ // B.
+ //
+ // 2. W threads process an entire block. W will typically be the
+ // device's subgroup/warp/wave width. W threads broadcast load a
+ // single command word.
+ //
+ // 3. W threads process W blocks. W threads load W command words and
+ // process W blocks.
+ //
+ // Clearly (1) has low I/O intensity but will achieve high
+ // parallelism by activating the most possible threads. The downside
+ // of this kind of approach is that the kernel will occupy even a
+ // large GPU with low intensity work and reduce opportunities for
+ // concurrent kernel execution (of other kernels).
+ //
+ // See Vasily Volkov's CUDA presentation describing these tradeoffs.
+ //
+ // Note that there are many other approaches. For example, similar
+ // to (1) but each thread loads a pow2 vector of block data.
+ //
+
+ // load the copied atomic read "base" from gmem
+ skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
+ // will always be less than 2^32
+ skc_uint const gid = get_global_id(0);
+ // every subgroup/simd that will work on the block loads the same command
+ skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
+ // path builder data can be spread across two spans
+ skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
+
+ // no need to make this branchless
+ if (pb_idx >= pb_size)
+ pb_idx -= pb_size;
+
+ // broadcast load the command
+ union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];
+
+ // what do we want to do with this block?
+ skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
+
+ // compute offset from rolling base to get index into block pool ring allocation
+ skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
+
+ // convert the pb_cmd's offset counter to a block id
+ skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
+
+#if 0
+ if (get_sub_group_local_id() == 0) {
+ printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
+ printf("< %8u >\n",block);
+ }
+#endif
+
+ // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
+ skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
+
+ // calculate bp_elems (to) / pb_elems (from)
+ skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
+ skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid;
+
+ // dispatch on the command tag
+ if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
+ {
+#if 0
+ if (tid == 0)
+ printf("%3u, segs\n",bp_off);
+#endif
+ skc_copy_segs(bp_elems,
+ bp_elems_idx,
+ pb_elems,
+ pb_elems_idx);
+ }
+ else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
+ {
+#if 0
+ if (tid == 0)
+ printf("%3u, NODE\n",bp_off);
+#endif
+ skc_copy_node(bp_elems, // to
+ bp_elems_idx,
+ bp_ids,
+ bp_reads,
+ bp_idx_mask,
+ pb_elems, // from
+ pb_elems_idx,
+ pb_rolling);
+ }
+ else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
+ {
+#if 0
+ if (tid == 0)
+ printf("%3u, HEAD\n",bp_off);
+#endif
+ skc_copy_head(host_map,
+ block,
+ bp_elems, // to
+ bp_elems_idx,
+ bp_ids,
+ bp_reads,
+ bp_idx_mask,
+ pb_elems, // from
+ pb_elems_idx,
+ pb_rolling);
+ }
+}
+
+//
+//
+//
+
+//
+// Reserves pb_cmd_count blocks from the block pool by atomically
+// bumping the pool's READS counter, and stashes the pre-increment
+// value in bp_alloc[bp_alloc_idx] where the paths_copy kernel later
+// reads it as its allocation base.
+//
+// NOTE(review): every work-item performs the unconditional atomic
+// add, so this looks intended for a single-work-item launch --
+// confirm against the host-side enqueue.
+//
+__kernel
+SKC_PATHS_ALLOC_KERNEL_ATTRIBS
+void
+skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
+ __global skc_uint * const bp_alloc,
+ skc_uint const bp_alloc_idx,
+ skc_uint const pb_cmd_count)
+{
+ //
+ // allocate blocks in block pool
+ //
+ skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
+
+ // store in slot
+ bp_alloc[bp_alloc_idx] = reads;
+
+#if 0
+ printf("pc: %8u + %u\n",reads,pb_cmd_count);
+#endif
+}
+
+//
+//
+//
diff --git a/src/compute/skc/paths_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl index 563160613c..2aee5dac17 100644 --- a/src/compute/skc/paths_reclaim.cl +++ b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl @@ -1,390 +1,390 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// FIXME -- a pre-allocation step could load the path header quads and -// total up the number of blocks in the workgroup or subgroup -// minimizing the number of later atomics adds. -// - -#include "device_cl_12_gen9.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "block.h" -#include "path.h" -#include "common.h" - -// -// -// - -#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS) - -#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS) - -// -// -// - -#if ( SKC_PATHS_RECLAIM_X == 1 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_PATHS_RECLAIM_X == 2 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_PATHS_RECLAIM_X == 4 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_PATHS_RECLAIM_X == 8 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_PATHS_RECLAIM_X == 16) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_PATHS_RECLAIM_X" -#endif - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - 
-#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - SKC_BROADCAST_LAST_HELPER(E,I) - -// -// COMPILE-TIME PREDICATES -// - -#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \ - (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \ - SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I) - -#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \ - SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I) - -// -// RUN-TIME PREDICATES -// - -#define SKC_PATHS_RECLAIM_IS_HEADER(I) \ - (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS) - -// -// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL -// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK -// COMBOS (NOT NECESSARILY POW2) -// -// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR -// UINT TYPE INSTEAD OF A ULONG. -// - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 -#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint - -// -// -// - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ - (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ - ? 
0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ - S = sub_group_scan_exclusive_add(C) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \ - (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK) - -// -// -// - -struct skc_reclaim -{ - skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; -}; - -__kernel -SKC_PATHS_RECLAIM_KERNEL_ATTRIBS -void -skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring - __global skc_uint * const bp_elems, // block pool blocks - __global skc_uint volatile * const bp_atomics, // read/write atomics - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const map, // path host-to-device map - struct skc_reclaim const reclaim) // array of host path ids -{ -#if (__OPENCL_VERSION__ < 200) - skc_uint const reclaim_stride = get_num_sub_groups(); -#else - skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); - -#if 0 - // - // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT - // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL - // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE - // RECLAMATION JOB ON THE REST OF THE PIPELINE. 
- // - for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) -#endif - { - // get host path id - skc_path_h const path = reclaim.aN[reclaim_idx]; - - // get the path header block from the map - skc_block_id_t id = map[path]; - - // - // blindly load all of the head elements into registers - // - skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - skc_uint count_blocks, count_nodes; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ - count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ - } \ - if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ - count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes); - } -#endif - - // - // acquire a span in the block pool ids ring for reclaimed ids - // - // FIXME count_blocks and atomic add can be done in same lane - // - skc_uint bp_ids_base = 0; - - if (get_sub_group_local_id() == 0) { - bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); - -#if 0 - printf("paths: bp_ids_base = %u\n",bp_ids_base); -#endif - } - - bp_ids_base = sub_group_broadcast(bp_ids_base,0); - - // - // shift away the tagged block id's tag - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == 
SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; - } - - // - // - we'll skip subgroups that are entirely header - // - // - but we need to mark any header elements that partially fill - // a subgroup as invalid tagged block ids - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \ - if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \ - h##I = SKC_TAGGED_BLOCK_ID_INVALID; \ - } \ - } \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - { - // - // count reclaimable blocks in each lane - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = h##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // printf("P %7u ! 
%u\n",bp_ids_idx,h##I); - } - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, walk the nodes - // - do { - // id of next block is in last lane - id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); - - // get index of each element - skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // - // blindly load all of the node elements into registers - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // shift away the tagged block id's tag - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; - } - - // - // count reclaimable blocks in each lane - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = n##I; \ - } \ - skc_uint const total = index + 
count; \ - bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // printf("P %7u ! %u\n",bp_ids_idx,n##I); - - // any more nodes? - } while (--count_nodes > 0); - } -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+// FIXME -- a pre-allocation step could load the path header quads and
+// total up the number of blocks in the workgroup or subgroup
+// minimizing the number of later atomics adds.
+//
+
+#include "block.h"
+#include "path.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "device_cl_12.h"
+
+//
+//
+//
+
+#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
+
+#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
+
+//
+//
+//
+
+#if ( SKC_PATHS_RECLAIM_X == 1 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
+
+#elif ( SKC_PATHS_RECLAIM_X == 2 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
+
+#elif ( SKC_PATHS_RECLAIM_X == 4 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
+
+#elif ( SKC_PATHS_RECLAIM_X == 8 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
+
+#elif ( SKC_PATHS_RECLAIM_X == 16)
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
+
+#else
+#error "MISSING SKC_PATHS_RECLAIM_X"
+#endif
+
+//
+// FIXME -- slate these for replacement
+//
+
+#define SKC_BROADCAST(E,S,I) \
+ sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_BROADCAST_LAST_HELPER(E,I) \
+ sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_BROADCAST_LAST(E,I) \
+ SKC_BROADCAST_LAST_HELPER(E,I)
+
+//
+// COMPILE-TIME PREDICATES
+//
+
+#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
+ SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
+ (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
+ (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
+ SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
+
+#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
+ SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
+
+//
+// RUN-TIME PREDICATES
+//
+
+#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
+ (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
+
+//
+// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
+// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
+// COMBOS (NOT NECESSARILY POW2)
+//
+// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
+// UINT TYPE INSTEAD OF A ULONG.
+//
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
+
+//
+//
+//
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
+ (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
+ ? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
+ S = sub_group_scan_exclusive_add(C)
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
+ (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
+
+//
+//
+//
+
+// Kernel-argument struct (passed by value to skc_kernel_paths_reclaim):
+// a fixed-size array of host path handles to reclaim in one launch.
+struct skc_reclaim
+{
+ skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
+};
+
+//
+// Reclaims the device blocks owned by an array of host path handles.
+//
+// One subgroup processes one path: it loads the path's header block,
+// reads the block/node counts, strips the tags from the tagged block
+// ids it holds, then walks the chained node blocks. For every block
+// id encountered it appends the id back onto the block pool ids ring
+// (bp_ids) so the block can be reallocated. A single lane-0 atomic
+// add on bp_atomics[SKC_BP_ATOMIC_OFFSET_WRITES] acquires the span of
+// ring slots for the whole path.
+//
+// bp_ids     : block pool ids ring -- reclaimed ids are written here
+// bp_elems   : block pool backing store -- header/node words are read
+// bp_atomics : block pool read/write counters (writes counter bumped)
+// bp_mask    : power-of-two modulo mask for the bp_ids ring
+// map        : host-to-device path handle map
+// reclaim    : by-value array of host path handles to reclaim
+//
+__kernel
+SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
+void
+skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
+ __global skc_uint * const bp_elems, // block pool blocks
+ __global skc_uint volatile * const bp_atomics, // read/write atomics
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const map, // path host-to-device map
+ struct skc_reclaim const reclaim) // array of host path ids
+{
+#if (__OPENCL_VERSION__ < 200)
+ skc_uint const reclaim_stride = get_num_sub_groups();
+#else
+ skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
+#endif
+ skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
+
+ // NOTE(review): with the bounding loop below compiled out (#if 0),
+ // reclaim_idx is used unchecked -- presumably the host launches
+ // exactly SKC_RECLAIM_ARRAY_SIZE subgroups; verify against the
+ // host-side enqueue.
+
+#if 0
+ //
+ // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
+ // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
+ // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
+ // RECLAMATION JOB ON THE REST OF THE PIPELINE.
+ //
+ for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
+#endif
+ {
+ // get host path id
+ skc_path_h const path = reclaim.aN[reclaim_idx];
+
+ // get the path header block from the map
+ skc_block_id_t id = map[path];
+
+ //
+ // blindly load all of the head elements into registers
+ //
+ skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // pick out count.nodes and count.prims from the header
+ //
+ // NOTE(review): both locals are assigned only inside the macro
+ // expansion below -- the IN_RANGE guards must cover both header
+ // offsets or these would remain uninitialized.
+ //
+ skc_uint count_blocks, count_nodes;
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
+ count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
+ } \
+ if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
+ count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+#if 0
+ if (get_sub_group_local_id() == 0) {
+ printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
+ }
+#endif
+
+ //
+ // acquire a span in the block pool ids ring for reclaimed ids
+ //
+ // FIXME count_blocks and atomic add can be done in same lane
+ //
+ skc_uint bp_ids_base = 0;
+
+ if (get_sub_group_local_id() == 0) {
+ bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
+
+#if 0
+ printf("paths: bp_ids_base = %u\n",bp_ids_base);
+#endif
+ }
+
+ bp_ids_base = sub_group_broadcast(bp_ids_base,0);
+
+ //
+ // shift away the tagged block id's tag
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // swap current id with next
+ //
+ // The last lane of the last register row holds the id of the next
+ // node block; exchange it with the header block id so the header
+ // itself is reclaimed and the walk can continue.
+ //
+ if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+ {
+ skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+ SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+ id = next;
+ }
+
+ //
+ // - we'll skip subgroups that are entirely header
+ //
+ // - but we need to mark any header elements that partially fill
+ // a subgroup as invalid tagged block ids
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
+ if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
+ h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
+ } \
+ } \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ {
+ //
+ // count reclaimable blocks in each lane
+ //
+ SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // scan to find index of each block
+ //
+ SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+ SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+ //
+ // store blocks back to ring
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+ skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+ skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
+ if (count > 0) { \
+ bp_ids[bp_ids_idx] = h##I; \
+ } \
+ skc_uint const total = index + count; \
+ bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ // printf("P %7u ! %u\n",bp_ids_idx,h##I);
+ }
+
+ //
+ // we're done if it was just the header
+ //
+ if (count_nodes == 0)
+ return;
+
+ //
+ // otherwise, walk the nodes
+ //
+ do {
+ // id of next block is in last lane
+ id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
+
+ // get index of each element
+ skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+ //
+ // blindly load all of the node elements into registers
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // shift away the tagged block id's tag
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // swap current id with next
+ //
+ if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+ {
+ skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+ SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+ id = next;
+ }
+
+ //
+ // count reclaimable blocks in each lane
+ //
+ SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // scan to find index of each block
+ //
+ SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+ SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+ //
+ // store blocks back to ring
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) { \
+ skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+ skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+ skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
+ if (count > 0) { \
+ bp_ids[bp_ids_idx] = n##I; \
+ } \
+ skc_uint const total = index + count; \
+ bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ // printf("P %7u ! %u\n",bp_ids_idx,n##I);
+
+ // any more nodes?
+ } while (--count_nodes > 0);
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/place.cl b/src/compute/skc/platforms/cl_12/kernels/place.cl index 00f16f7843..92fa0a243d 100644 --- a/src/compute/skc/place.cl +++ b/src/compute/skc/platforms/cl_12/kernels/place.cl @@ -1,871 +1,871 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "common.h" -#include "atomic_cl.h" -#include "raster.h" -#include "tile.h" - -// -// -// - -#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1) -#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1) - -// -// -// - -#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK - -// -// -// - -#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE) - -// -// -// - -#if ( SKC_PLACE_X == 1 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_1() -#define SKC_PLACE_EXPAND_I_LAST 0 - -#elif ( SKC_PLACE_X == 2 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_2() -#define SKC_PLACE_EXPAND_I_LAST 1 - -#elif ( SKC_PLACE_X == 4 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_4() -#define SKC_PLACE_EXPAND_I_LAST 3 - -#elif ( SKC_PLACE_X == 8 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_8() -#define SKC_PLACE_EXPAND_I_LAST 7 - -#elif ( SKC_PLACE_X == 16) -#define SKC_PLACE_EXPAND() SKC_EXPAND_16() -#define SKC_PLACE_EXPAND_I_LAST 15 -#endif - -// -// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE -// COALESCED WRITES. LO FIRST, FOLLOWED BY HI. -// -// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE -// KERNELS USE DIFFERENT SUBGROUP SIZES. -// -// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE -// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID. -// -// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER -// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). 
VULKAN MAY -// ONLY SUPPORT A SUBGROUP SIZE OF 16. -// - -#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE ) - -#define SKC_PLACE_STRIDE_H(L) (L) -#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1) -#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK)) - -#define SKC_PLACE_STRIDE_H(L) (L) -#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) -#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask - -#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK)) -#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO) - -#endif - -// -// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE -// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8) -// - -#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS) - -#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS) - -#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * 
SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS) - -#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) - - -// -// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX -// -#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) -#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k)) - -// -// TTSK v2: -// -// 0 63 -// | TTSB ID | PREFIX | SPAN | X | Y | -// +---------+--------+---------+-----+-----+ -// | 27 | 1 (=0) | 12 (=0) | 12 | 12 | -// -// -// TTPK v2: -// -// 0 63 -// | TTPB ID | PREFIX | SPAN | X | Y | -// +---------+--------+------+-----+-----+ -// | 27 | 1 (=1) | 12 | 12 | 12 | -// -// - -// -// TTCK (32-BIT COMPARE) v1: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 18 | 7 | 7 | -// -// -// TTCK (32-BIT COMPARE) v2: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 15 | 9 | 8 | -// -// -// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 27 | 1 | 1 | 18 | 9 | 8 | -// - -union skc_subgroup_smem -{ - skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE - - struct { - struct { - skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; - skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; - } lo; - - struct { - skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; - skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; - } hi; - - // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK]; - }; - -}; - -// -// scatter scan max -// -static -skc_int_v_t 
-skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem, - skc_int_v_t const iss, - skc_int_v_t const ess) -{ - // - // prefix sums determine which lanes we're going to work on next - // - skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE); - skc_int_v_t const scratch_idx = max(ess,0); - - // - // SIMT - // - - // - // zero the volatile smem scratchpad using vector syntax - // - smem->scratch[get_sub_group_local_id()] = ( 0 ); - - // - // store source lane at starting lane - // - if (is_scratch_store) { - smem->scratch[scratch_idx] = get_sub_group_local_id(); - } - - // - // propagate lanes to right using max scan - // - skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()]; - skc_int_v_t const source = sub_group_scan_inclusive_max(scratch); - - return source; -} - -// -// -// - -static -skc_bool -skc_xk_clip(union skc_tile_clip const * const tile_clip, - skc_ttxk_t * const xk) -{ - // - // clip the sk and pk keys - // - // if fully clipped then return false - // - // alternatively -- we can expand all these keys in place - // - // alternatively -- keep sk and pk keys segregated because sk - // represents the vast majority of keys and are easier to process. - // don't mess with the fastpath! 
- // - return false; -} - -// -// -// - -static -skc_ttck_t -skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const sk_idx) -{ - skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0 - skc_uint const hi = smem->hi.sk[sk_idx]; - - skc_ttck_t ck; - - ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id - - // FIXME -- x and y should already be clipped and shifted - skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; - skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; - - ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; - - return ck; -} - -static -skc_ttck_t -skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const pk_idx, - skc_uint const dx) -{ - skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1 - skc_uint const hi = smem->hi.pk[pk_idx]; - - skc_ttck_t ck; - - ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id - - // FIXME -- x and y should already be clipped and shifted - skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; - skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; - - ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; - - return ck; -} - -// -// -// - -static -void -skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const sk) -{ - // - // Pretty sure you can never ever have an sk count equal to 0 - // - skc_uint ck_base = 0; - - // last lane 
performs the block pool allocation with an atomic increment - if (get_sub_group_local_id() == 0) { - ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk); - } - - // broadcast base to all lanes - ck_base = sub_group_broadcast(ck_base,0); - - // convert sk keys to ck keys - for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE) - { - ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii); - } -} - -// -// -// - -static -skc_int -skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem, - skc_uint const idx) -{ - skc_uint const lo = smem->lo.pk[idx]; - skc_uint const hi = smem->hi.pk[idx]; - - skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN; - skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN; - - return (span_lo | span_hi) + 1; -} - -// -// -// - -static -void -skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const pk) -{ - // bail out if pk queue is empty - if (pk == 0) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("%u\n",pk); -#endif - - // - // FIXME -- this nested loop iterates over the queue processing a - // subgroup of 64-bit keys at a time. This is probably not the most - // efficient approach so investigate how to store and iterate over a - // wider than subgroup (node-sized) queue of keys. - // - - // round up so we work with full subgroups - skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK; - skc_uint ii = 0; - - // nested loop that expands all ttpk keys -#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE) - for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE) -#endif - { - skc_uint idx = ii + get_sub_group_local_id(); - skc_int span = 0; - - // how many tiles does this ttpk span? 
- if (idx < pk) - span = skc_ttpk_get_span(smem,idx); - - // we need inclusive, exclusive and total - skc_int iss = sub_group_scan_inclusive_add(span); - skc_int ess = iss - span; - skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1); - - // printf("%u : %u\n",span,iss); - // continue; - - // atomically allocate space for the pk keys - skc_uint ck_base = 0; - - // last lane performs the block pool allocation with an atomic increment - if (get_sub_group_local_id() == 0) { - ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem); - } - - // broadcast atomically allocated extent base to all lanes - skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id(); - - // - // FIXME -- this loop would probably be faster if the ttpk keys - // were held in registers and accessed with shuffles instead of - // SMEM loads - // - - // - // loop until there are no more expanded pk keys - // - while (true) - { - skc_int const source = skc_scatter_scan_max(smem,iss,ess); - skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source); - - // store valid ck keys to gmem - if (get_sub_group_local_id() < rem) { - ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx); - } - - // decrement remainder - rem -= SKC_PLACE_SUBGROUP_SIZE; - - if (rem <= 0) - break; - - // increment/decrement indices - ck_idx += SKC_PLACE_SUBGROUP_SIZE; - iss -= SKC_PLACE_SUBGROUP_SIZE; - ess -= SKC_PLACE_SUBGROUP_SIZE; - } - } -} - -// -// -// - -static -skc_uint -skc_ballot(skc_uint * const xk, skc_uint const is_xk) -{ -#if 0 - // - // FIXME -- when available, this should use the idiom: - // - // ballot() + lane_mask_less_than_or_equal + popcount() - // - // Supported by: - // - // - Vulkan 1.1 / SPIR-V 1.3 - // - CUDA - // - AVX2 (SSE*?) 
- // -#else - // - // otherwise, emulate with an inclusive scan (yuk) - // - skc_uint const prefix = sub_group_scan_inclusive_add(is_xk); - - skc_uint const xk_idx = *xk + prefix - is_xk; - - *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST); - -#if 0 - printf("< %3u >\n",xk_idx); -#endif - - return xk_idx; -#endif -} - -// -// -// -__kernel -SKC_PLACE_KERNEL_ATTRIBS -void -skc_kernel_place(__global skc_bp_elem_t * const bp_elems, - __global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __global union skc_cmd_place const * const cmds, - __global skc_block_id_t * const map, - skc_uint4 const clip, - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) - __local union skc_subgroup_smem volatile smem[1]; -#else - __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS]; - __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // This is a subgroup-centric kernel - // - // Which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // - // Test the raster's translated bounds against the composition's - // tile clip - // - // There are 3 cases: - // - // - the raster is completely clipped -> return - // - the raster is partially clipped -> all keys must clipped - // - the raster is not clipped -> no keys are tested - // - // - // There are at least 4 implementations of place and we want to - // special-case them as much as possible so that, at the least, the - // fastpath remains fast. 
- // - // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP - // - // - implement CLIPPED + NO TRANSLATION path - // - // - implement NO CLIP + TRANSLATION path - // - // - implement CLIPPED + TRANSLATION path - // - // - // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin - // 12:12:8 integer where: - // - // 12: ttsk - // 12: ttpk - // 8: /dev/null -- clipped or invalid key - // - // Three kinds of nodes in a raster's list: - // - // - the head node - // - an internal node - // - the final node - // - -#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const cmd_idx = get_group_id(0); -#else - skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // load command - union skc_cmd_place const cmd = cmds[cmd_idx]; - - // get the raster header from the raster host id -- scalar - skc_block_id_t id = map[cmd.raster_h]; - - // - // load all of the head block ttxk keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_raster_node_elem const h##I = { \ - .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \ - bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \ - }; - - SKC_PLACE_EXPAND(); - - // - // load raster header counts -- we only need the "nodes" and "keys" - // words but the keys we loaded are doublewords. - // - // FIXME -- this can be made portable with compile-time macro expansion - // - skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES - skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS - - // - // - // -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%5u : %6u : %3u : %08X . 
%08X - %08X\n", \ - nodes,keys, \ - I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ - h##I.u32v2.hi,h##I.u32v2.lo, \ - h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); - - SKC_PLACE_EXPAND(); -#endif - - // -#if 0 - if (get_sub_group_local_id() == 0) { - printf("place: %u / %u / %u\n",head_id,nodes,keys); - } -#endif - - { - // - // classify every key in the header - // - // keys: 0 is not a key / 1 is a key - // skpk: 0 is sk / 1 is pk - // - skc_uint bits_keys = 0; - skc_uint bits_skpk = 0; - - // - // calculate bits_keys - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \ - if (idx < keys) { \ - bits_keys |= (1u << I); \ - } \ - if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ - if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \ - if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ - bits_keys &= ~(1u << I); \ - } \ - } \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // blindly calculate bits_skpk - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2X : %2X\n",bits_keys,bits_skpk); -#endif - - // - // next pointer is last element of last row. save it now because - // this might be recognized as a subgroup-uniform/scalar. 
- // - id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); - - // - // append SK keys first - // - skc_uint const bits_sk = bits_keys & ~bits_skpk; - skc_uint sk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint is_sk = (bits_sk >> I) & 1; \ - skc_uint sk_idx = skc_ballot(&sk,is_sk); \ - if (is_sk) { \ - smem->lo.sk[sk_idx] = h##I.xk.lo; \ - smem->hi.sk[sk_idx] = h##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // append PK keys next - // - skc_uint const bits_pk = bits_keys & bits_skpk; - skc_uint pk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint is_pk = (bits_pk >> I) & 1; \ - skc_uint pk_idx = skc_ballot(&pk,is_pk); \ - if (is_pk) { \ - smem->lo.pk[pk_idx] = h##I.xk.lo; \ - smem->hi.pk[pk_idx] = h##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2u * %2u\n",sk,pk); -#endif - // - // flush the keys - // - skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); - skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); - } - - // - // we're done if there was only a head node - // - if (nodes == 0) - return; - - // - // decrement keys - // - keys -= SKC_RASTER_HEAD_COUNT_KEYS; - - // - // otherwise, append keys in trailing nodes to smem - // - while (true) - { - // - // load all of the node block ttxk keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_raster_node_elem const n##I = { \ - .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \ - bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \ - }; - - SKC_PLACE_EXPAND(); - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%5u : 
%6u : %3u : %08X . %08X - %08X\n", \ - nodes,keys, \ - I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ - n##I.u32v2.hi,n##I.u32v2.lo, \ - n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); - - SKC_PLACE_EXPAND(); -#endif - - // - // classify every key in the header - // - // keys: 0 is not a key / 1 is a key - // skpk: 0 is sk / 1 is pk - // - skc_uint bits_keys = 0; - skc_uint bits_skpk = 0; - - // - // calculate bits_keys - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \ - if (idx < keys) { \ - bits_keys |= (1u << I); \ - } \ - if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ - if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \ - if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ - bits_keys &= ~(1u << I); \ - } \ - } \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // blindly calculate bits_skpk - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2X : %2X\n",bits_keys,bits_skpk); -#endif - - // - // next pointer is last element of last row. save it now because - // this might be recognized as a subgroup-uniform/scalar. 
- // - id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); - - // - // append SK keys first - // - skc_uint const bits_sk = bits_keys & ~bits_skpk; - skc_uint sk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint is_sk = (bits_sk >> I) & 1; \ - skc_uint sk_idx = skc_ballot(&sk,is_sk); \ - if (is_sk) { \ - smem->lo.sk[sk_idx] = n##I.xk.lo; \ - smem->hi.sk[sk_idx] = n##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // append PK keys next - // - skc_uint const bits_pk = bits_keys & bits_skpk; - skc_uint pk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint is_pk = (bits_pk >> I) & 1; \ - skc_uint pk_idx = skc_ballot(&pk,is_pk); \ - if (is_pk) { \ - smem->lo.pk[pk_idx] = n##I.xk.lo; \ - smem->hi.pk[pk_idx] = n##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2u * %2u\n",sk,pk); -#endif - // - // if total for either the sk or pk queue reaches the - // highwater mark then flush it to the extent - // - skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); - skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); - - // - // if this was the last node then we're done - // - if (--nodes == 0) - return; - - // - // otherwise decrement keys - // - keys -= SKC_RASTER_NODE_COUNT_KEYS; - } -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "common.h"
+#include "raster.h"
+#include "atomic_cl.h"
+#include "device_cl_12.h"
+
+//
+//
+//
+
+#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1)
+#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1)
+
+//
+//
+//
+
+#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK
+
+//
+//
+//
+
+#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
+
+//
+//
+//
+
+#if ( SKC_PLACE_X == 1 )
+#define SKC_PLACE_EXPAND() SKC_EXPAND_1()
+#define SKC_PLACE_EXPAND_I_LAST 0
+
+#elif ( SKC_PLACE_X == 2 )
+#define SKC_PLACE_EXPAND() SKC_EXPAND_2()
+#define SKC_PLACE_EXPAND_I_LAST 1
+
+#elif ( SKC_PLACE_X == 4 )
+#define SKC_PLACE_EXPAND() SKC_EXPAND_4()
+#define SKC_PLACE_EXPAND_I_LAST 3
+
+#elif ( SKC_PLACE_X == 8 )
+#define SKC_PLACE_EXPAND() SKC_EXPAND_8()
+#define SKC_PLACE_EXPAND_I_LAST 7
+
+#elif ( SKC_PLACE_X == 16)
+#define SKC_PLACE_EXPAND() SKC_EXPAND_16()
+#define SKC_PLACE_EXPAND_I_LAST 15
+#endif
+
+//
+// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
+// COALESCED WRITES. LO FIRST, FOLLOWED BY HI.
+//
+// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
+// KERNELS USE DIFFERENT SUBGROUP SIZES.
+//
+// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
+// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
+//
+// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
+// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
+// ONLY SUPPORT A SUBGROUP SIZE OF 16.
+//
+
+#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
+
+#define SKC_PLACE_STRIDE_H(L) (L)
+#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
+
+#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1)
+#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
+
+#define SKC_PLACE_STRIDE_H(L) (L)
+#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
+
+#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
+#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
+
+#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
+#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
+
+#endif
+
+//
+// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
+// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
+//
+
+// row (i) lies entirely within the raster header dwords
+#define SKC_PLACE_IS_ALL_HEADER_ROW(i)  (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
+
+// row (i) contains no raster header dwords at all
+#define SKC_PLACE_IS_NOT_HEADER_ROW(i)  ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
+
+// row (i) is the final row of a device block
+#define SKC_PLACE_IS_TRAILING_ROW(i)  (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
+
+// NOTE(review): this macro's expansion references 'k' but only (i) is
+// declared as a parameter, so it expands correctly only where a
+// variable named 'k' happens to be in scope.  It duplicates
+// SKC_PLACE_HEADER_LESS_THAN(i,k) below and appears unused in this
+// file -- confirm before fixing the signature or removing it.
+#define SKC_PLACE_IS_HEADER_ROW_KEY(i)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
+
+
+//
+// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX
+//
+#define SKC_PLACE_HEADER_LESS_THAN(i,k)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
+#define SKC_PLACE_NODE_LESS_THAN(i,k)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))
+
+//
+// TTSK v2:
+//
+// 0 63
+// | TTSB ID | PREFIX | SPAN | X | Y |
+// +---------+--------+---------+-----+-----+
+// | 27 | 1 (=0) | 12 (=0) | 12 | 12 |
+//
+//
+// TTPK v2:
+//
+// 0 63
+// | TTPB ID | PREFIX | SPAN | X | Y |
+// +---------+--------+------+-----+-----+
+// | 27 | 1 (=1) | 12 | 12 | 12 |
+//
+//
+
+//
+// TTCK (32-BIT COMPARE) v1:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 30 | 1 | 1 | 18 | 7 | 7 |
+//
+//
+// TTCK (32-BIT COMPARE) v2:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 30 | 1 | 1 | 15 | 9 | 8 |
+//
+//
+// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 27 | 1 | 1 | 18 | 9 | 8 |
+//
+
+//
+// Subgroup-local scratch memory.
+//
+// Two overlaid uses:
+//
+//  - scratch[] : per-lane scratchpad used by the scatter-scan-max
+//    expansion of TTPK keys (see skc_scatter_scan_max)
+//
+//  - lo/hi sk[]/pk[] : split-dword staging queues where classified
+//    TTSK and TTPK keys are appended before being flushed to the
+//    TTCK extent
+//
+// NOTE(review): scratch[] aliases the start of the lo.sk[] queue --
+// this looks safe because the sk queue is flushed before the TTPK
+// expansion writes scratch, but confirm SKC_PLACE_SMEM_COUNT_TTSK >=
+// SKC_PLACE_SUBGROUP_SIZE.
+//
+union skc_subgroup_smem
+{
+  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE
+
+  struct {
+    struct {
+      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
+      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
+    } lo;
+
+    struct {
+      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
+      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
+    } hi;
+
+    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
+  };
+
+};
+
+//
+// scatter scan max
+//
+//
+// Given the inclusive (iss) and exclusive (ess) prefix sums of the
+// per-lane TTPK spans, return for every lane the id of the source
+// lane whose span covers that output slot.
+//
+// Each lane that still has output pending scatters its lane id into
+// the scratchpad at the slot where its output begins; an inclusive
+// max-scan then propagates each source lane id rightward across the
+// rest of its span.
+//
+static
+skc_int_v_t
+skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
+                     skc_int_v_t                              const iss,
+                     skc_int_v_t                              const ess)
+{
+  //
+  // prefix sums determine which lanes we're going to work on next
+  //
+  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
+  skc_int_v_t  const scratch_idx      = max(ess,0);
+
+  //
+  // SIMT
+  //
+
+  //
+  // zero the volatile smem scratchpad using vector syntax
+  //
+  smem->scratch[get_sub_group_local_id()] = ( 0 );
+
+  //
+  // store source lane at starting lane
+  //
+  if (is_scratch_store) {
+    smem->scratch[scratch_idx] = get_sub_group_local_id();
+  }
+
+  //
+  // propagate lanes to right using max scan
+  //
+  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
+  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);
+
+  return source;
+}
+
+//
+//
+//
+
+//
+// Placeholder: clipping of sk/pk keys against the tile clip is not
+// implemented yet, so no key is ever reported as fully clipped.
+//
+// Implementation notes for whoever completes this:
+//
+//  - the keys could instead be expanded in place
+//
+//  - sk and pk keys could be kept segregated since sk keys represent
+//    the vast majority and are easier to process -- don't slow down
+//    the fastpath!
+//
+static
+skc_bool
+skc_xk_clip(union skc_tile_clip const * const tile_clip,
+            skc_ttxk_t                * const xk)
+{
+  return false; // "not fully clipped"
+}
+
+//
+//
+//
+
+//
+// Convert a staged TTSK key into a TTCK key: merge in the place
+// command's layer id and translate the tile coordinate by (tx,ty).
+//
+static
+skc_ttck_t
+skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
+             union skc_cmd_place             const    * const cmd,
+             skc_uint                                   const sk_idx)
+{
+  // sk keys carry a zero prefix bit so the lo dword is usable as-is
+  skc_uint const sk_lo = smem->lo.sk[sk_idx];
+  skc_uint const sk_hi = smem->hi.sk[sk_idx];
+
+  // translate the tile coordinate
+  // FIXME -- x and y should already be clipped and shifted
+  skc_uint const ck_x = (cmd->tx + SKC_BFE(sk_hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
+  skc_uint const ck_y = (cmd->ty + SKC_BFE(sk_hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
+
+  skc_ttck_t ck;
+
+  // splice the layer id across both dwords
+  // FIXME -- preshift the layer id
+  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | ck_x | ck_y;
+  ck.lo = sk_lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
+
+  return ck;
+}
+
+//
+// Convert one tile of an expanded TTPK key into a TTCK key: mask off
+// the span, merge in the layer id, and translate the tile coordinate
+// by (tx+dx,ty) where dx is the offset within the span.
+//
+static
+skc_ttck_t
+skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
+             union skc_cmd_place             const    * const cmd,
+             skc_uint                                   const pk_idx,
+             skc_uint                                   const dx)
+{
+  // pk keys carry a set prefix bit -- keep only id+prefix bits
+  skc_uint const pk_lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX;
+  skc_uint const pk_hi = smem->hi.pk[pk_idx];
+
+  // translate the tile coordinate, offsetting x by dx within the span
+  // FIXME -- x and y should already be clipped and shifted
+  skc_uint const ck_x = (cmd->tx + dx + SKC_BFE(pk_hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
+  skc_uint const ck_y = (cmd->ty +      SKC_BFE(pk_hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
+
+  skc_ttck_t ck;
+
+  // splice the layer id across both dwords
+  // FIXME -- preshift the layer id
+  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | ck_x | ck_y;
+  ck.lo = pk_lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
+
+  return ck;
+}
+
+//
+//
+//
+
+//
+// Flush all staged TTSK keys to the TTCK extent.
+//
+// Lane 0 reserves 'sk' slots in the extent with a single relaxed
+// atomic add; the base index is broadcast and the subgroup then
+// converts and stores the keys cooperatively.
+//
+static
+void
+skc_ttsk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
+               __global skc_ttck_t                       * const ck_extent,
+               __local union skc_subgroup_smem  volatile * const smem,
+               union skc_cmd_place              const    * const cmd,
+               skc_uint                                    const sk)
+{
+  //
+  // Pretty sure you can never ever have an sk count equal to 0
+  //
+  skc_uint ck_base = 0;
+
+  // lane 0 performs the extent allocation with an atomic increment
+  if (get_sub_group_local_id() == 0) {
+    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
+  }
+
+  // broadcast base to all lanes
+  ck_base = sub_group_broadcast(ck_base,0);
+
+  // convert sk keys to ck keys
+  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
+    {
+      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
+    }
+}
+
+//
+//
+//
+
+//
+// Return the number of tiles spanned by the TTPK key at idx.
+//
+// The span bitfield is split across the two dwords of the key; the
+// stored value is biased by one.
+//
+static
+skc_int
+skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
+                  skc_uint                                   const idx)
+{
+  skc_uint const pk_lo = smem->lo.pk[idx];
+  skc_uint const pk_hi = smem->hi.pk[idx];
+
+  // reassemble the split span: low bits from the lo dword, high bits
+  // from the hi dword
+  skc_uint const bits_lo = pk_lo >> SKC_TTXK_LO_OFFSET_SPAN;
+  skc_uint const bits_hi = (pk_hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
+
+  // undo the bias
+  return (bits_hi | bits_lo) + 1;
+}
+
+//
+//
+//
+
+//
+// Expand all staged TTPK keys into one TTCK key per spanned tile and
+// flush them to the TTCK extent.
+//
+// For each subgroup-sized batch of pk keys: compute per-key spans,
+// prefix-sum them, atomically reserve extent space for the expanded
+// total, then repeatedly map output lanes back to source keys with
+// the scatter-scan-max trick until the batch is drained.
+//
+static
+void
+skc_ttpk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
+               __global skc_ttck_t                       * const ck_extent,
+               __local union skc_subgroup_smem  volatile * const smem,
+               union skc_cmd_place              const    * const cmd,
+               skc_uint                                    const pk)
+{
+  // bail out if pk queue is empty
+  if (pk == 0)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("%u\n",pk);
+#endif
+
+  //
+  // FIXME -- this nested loop iterates over the queue processing a
+  // subgroup of 64-bit keys at a time. This is probably not the most
+  // efficient approach so investigate how to store and iterate over a
+  // wider than subgroup (node-sized) queue of keys.
+  //
+
+  // round up so we work with full subgroups
+  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
+  skc_uint ii = 0;
+
+  // nested loop that expands all ttpk keys
+#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
+  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
+#endif
+    {
+      skc_uint idx = ii + get_sub_group_local_id();
+      skc_int span = 0;
+
+      // how many tiles does this ttpk span?
+      if (idx < pk)
+        span = skc_ttpk_get_span(smem,idx);
+
+      // we need inclusive, exclusive and total
+      skc_int iss = sub_group_scan_inclusive_add(span);
+      skc_int ess = iss - span;
+      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
+
+      // printf("%u : %u\n",span,iss);
+      // continue;
+
+      // atomically allocate space for the pk keys
+      skc_uint ck_base = 0;
+
+      // lane 0 performs the extent allocation with an atomic increment
+      if (get_sub_group_local_id() == 0) {
+        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
+      }
+
+      // broadcast atomically allocated extent base to all lanes
+      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
+
+      //
+      // FIXME -- this loop would probably be faster if the ttpk keys
+      // were held in registers and accessed with shuffles instead of
+      // SMEM loads
+      //
+
+      //
+      // loop until there are no more expanded pk keys
+      //
+      while (true)
+        {
+          // which source key covers this output lane?
+          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
+          // tile offset of this lane within the source key's span
+          // NOTE: intel_sub_group_shuffle is Intel-specific (cl_intel_subgroups)
+          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
+
+          // store valid ck keys to gmem
+          if (get_sub_group_local_id() < rem) {
+            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
+          }
+
+          // decrement remainder
+          rem -= SKC_PLACE_SUBGROUP_SIZE;
+
+          if (rem <= 0)
+            break;
+
+          // increment/decrement indices
+          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
+          iss    -= SKC_PLACE_SUBGROUP_SIZE;
+          ess    -= SKC_PLACE_SUBGROUP_SIZE;
+        }
+    }
+}
+
+//
+//
+//
+
+//
+// Compacting-append helper: returns this lane's output index for a
+// compacted append of lanes with is_xk == 1, and advances *xk by the
+// subgroup-wide count of such lanes.
+//
+// is_xk must be 0 or 1. The preferred ballot/popcount idiom isn't
+// available here, so it is emulated with an inclusive add scan (see
+// the disabled #if block).
+//
+static
+skc_uint
+skc_ballot(skc_uint * const xk, skc_uint const is_xk)
+{
+#if 0
+  //
+  // FIXME -- when available, this should use the idiom:
+  //
+  //   ballot() + lane_mask_less_than_or_equal + popcount()
+  //
+  // Supported by:
+  //
+  //   - Vulkan 1.1 / SPIR-V 1.3
+  //   - CUDA
+  //   - AVX2 (SSE*?)
+  //
+#else
+  //
+  // otherwise, emulate with an inclusive scan (yuk)
+  //
+  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
+
+  // exclusive position of this lane among the appending lanes
+  skc_uint const xk_idx = *xk + prefix - is_xk;
+
+  // bump the queue count by the subgroup total (last lane's prefix)
+  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
+
+#if 0
+  printf("< %3u >\n",xk_idx);
+#endif
+
+  return xk_idx;
+#endif
+}
+
+//
+//
+//
+//
+// PLACE KERNEL
+//
+// One subgroup processes one place command: it walks the raster's
+// head block and the chain of trailing node blocks, classifies each
+// TTXK key as TTSK or TTPK, stages the keys in subgroup-local memory
+// via skc_ballot(), and flushes them to the TTCK extent.
+//
+__kernel
+SKC_PLACE_KERNEL_ATTRIBS
+void
+skc_kernel_place(__global skc_bp_elem_t            * const bp_elems,
+                 __global SKC_ATOMIC_UINT volatile * const place_atomics,
+                 __global skc_ttck_t               * const ck_extent,
+                 __global union skc_cmd_place const * const cmds,
+                 __global skc_block_id_t           * const map,
+                 skc_uint4                           const clip,
+                 skc_uint                            const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
+  __local union skc_subgroup_smem volatile smem[1];
+#else
+  __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
+  __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+
+  //
+  // This is a subgroup-centric kernel
+  //
+  // Which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+  // get_group_id(0) as a uniform but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and driving spillage elsewhere.
+  //
+  // Test the raster's translated bounds against the composition's
+  // tile clip
+  //
+  // There are 3 cases:
+  //
+  //   - the raster is completely clipped -> return
+  //   - the raster is partially clipped -> all keys must clipped
+  //   - the raster is not clipped -> no keys are tested
+  //
+  //
+  // There are at least 4 implementations of place and we want to
+  // special-case them as much as possible so that, at the least, the
+  // fastpath remains fast.
+  //
+  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
+  //
+  //  - implement CLIPPED + NO TRANSLATION path
+  //
+  //  - implement NO CLIP + TRANSLATION path
+  //
+  //  - implement CLIPPED + TRANSLATION path
+  //
+  //
+  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
+  // 12:12:8 integer where:
+  //
+  //   12: ttsk
+  //   12: ttpk
+  //    8: /dev/null -- clipped or invalid key
+  //
+  // Three kinds of nodes in a raster's list:
+  //
+  //   - the head node
+  //   - an internal node
+  //   - the final node
+  //
+
+  // NOTE(review): 'clip' and 'count' are never read in this body --
+  // clipping is not yet implemented (see skc_xk_clip) and cmd_idx is
+  // not guarded against 'count', so the launch presumably covers
+  // exactly 'count' commands -- confirm against the host-side caller.
+#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
+  skc_uint const cmd_idx = get_group_id(0);
+#else
+  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  // load command
+  union skc_cmd_place const cmd = cmds[cmd_idx];
+
+  // get the raster header from the raster host id -- scalar
+  skc_block_id_t id = map[cmd.raster_h];
+
+  //
+  // load all of the head block ttxk keys into registers
+  //
+  // FIXME -- this pattern lends itself to using the higher
+  // performance Intel GEN block load instructions
+  //
+  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+  union skc_raster_node_elem const h##I = {                     \
+    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],    \
+               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] }   \
+  };
+
+  SKC_PLACE_EXPAND();
+
+  //
+  // load raster header counts -- we only need the "nodes" and "keys"
+  // words but the keys we loaded are doublewords.
+  //
+  // FIXME -- this can be made portable with compile-time macro expansion
+  //
+  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
+  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
+
+  //
+  //
+  //
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+  printf("%5u : %6u : %3u : %08X . %08X - %08X\n",              \
+         nodes,keys,                                            \
+         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),  \
+         h##I.u32v2.hi,h##I.u32v2.lo,                           \
+         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
+
+  SKC_PLACE_EXPAND();
+#endif
+
+  //
+#if 0
+  if (get_sub_group_local_id() == 0) {
+    printf("place: %u / %u / %u\n",head_id,nodes,keys);
+  }
+#endif
+
+  {
+    //
+    // classify every key in the header
+    //
+    // keys: 0 is not a key / 1 is a key
+    // skpk: 0 is sk / 1 is pk
+    //
+    skc_uint bits_keys = 0;
+    skc_uint bits_skpk = 0;
+
+    //
+    // calculate bits_keys
+    //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+  if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                                \
+    skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
+    if (idx < keys) {                                                   \
+      bits_keys |= (1u << I);                                           \
+    }                                                                   \
+    if (SKC_PLACE_IS_TRAILING_ROW(I)) {                                 \
+      if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                          \
+        if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {      \
+          bits_keys &= ~(1u << I);                                      \
+        }                                                               \
+      }                                                                 \
+    }                                                                   \
+  }
+
+    SKC_PLACE_EXPAND();
+
+    //
+    // blindly calculate bits_skpk
+    //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+  if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                                \
+    bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
+  }
+
+    SKC_PLACE_EXPAND();
+
+#if 0
+    printf("%2X : %2X\n",bits_keys,bits_skpk);
+#endif
+
+    //
+    // next pointer is last element of last row. save it now because
+    // this might be recognized as a subgroup-uniform/scalar.
+    //
+    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
+
+    //
+    // append SK keys first
+    //
+    skc_uint const bits_sk = bits_keys & ~bits_skpk;
+    skc_uint       sk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                         \
+  if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                \
+    skc_uint is_sk  = (bits_sk >> I) & 1;               \
+    skc_uint sk_idx = skc_ballot(&sk,is_sk);            \
+    if (is_sk) {                                        \
+      smem->lo.sk[sk_idx] = h##I.xk.lo;                 \
+      smem->hi.sk[sk_idx] = h##I.xk.hi;                 \
+    }                                                   \
+  }
+
+    SKC_PLACE_EXPAND();
+
+    //
+    // append PK keys next
+    //
+    skc_uint const bits_pk = bits_keys & bits_skpk;
+    skc_uint       pk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                         \
+  if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                \
+    skc_uint is_pk  = (bits_pk >> I) & 1;               \
+    skc_uint pk_idx = skc_ballot(&pk,is_pk);            \
+    if (is_pk) {                                        \
+      smem->lo.pk[pk_idx] = h##I.xk.lo;                 \
+      smem->hi.pk[pk_idx] = h##I.xk.hi;                 \
+    }                                                   \
+  }
+
+    SKC_PLACE_EXPAND();
+
+#if 0
+    printf("%2u * %2u\n",sk,pk);
+#endif
+    //
+    // flush the keys
+    //
+    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
+    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
+  }
+
+  //
+  // we're done if there was only a head node
+  //
+  if (nodes == 0)
+    return;
+
+  //
+  // decrement keys
+  //
+  keys -= SKC_RASTER_HEAD_COUNT_KEYS;
+
+  //
+  // otherwise, append keys in trailing nodes to smem
+  //
+  while (true)
+    {
+      //
+      // load all of the node block ttxk keys into registers
+      //
+      // FIXME -- this pattern lends itself to using the higher
+      // performance Intel GEN block load instructions
+      //
+      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+  union skc_raster_node_elem const n##I = {                     \
+    .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],    \
+               bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] }   \
+  };
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+      printf("%5u : %6u : %3u : %08X . %08X - %08X\n",          \
+             nodes,keys,                                        \
+             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
+             n##I.u32v2.hi,n##I.u32v2.lo,                       \
+             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
+
+      SKC_PLACE_EXPAND();
+#endif
+
+      //
+      // classify every key in the header
+      //
+      // keys: 0 is not a key / 1 is a key
+      // skpk: 0 is sk / 1 is pk
+      //
+      skc_uint bits_keys = 0;
+      skc_uint bits_skpk = 0;
+
+      //
+      // calculate bits_keys
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
+    skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
+    if (idx < keys) {                                                   \
+      bits_keys |= (1u << I);                                           \
+    }                                                                   \
+    if (SKC_PLACE_IS_TRAILING_ROW(I)) {                                 \
+      if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                          \
+        if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {      \
+          bits_keys &= ~(1u << I);                                      \
+        }                                                               \
+      }                                                                 \
+    }                                                                   \
+  }
+
+      SKC_PLACE_EXPAND();
+
+      //
+      // blindly calculate bits_skpk
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
+    bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
+  }
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+      printf("%2X : %2X\n",bits_keys,bits_skpk);
+#endif
+
+      //
+      // next pointer is last element of last row. save it now because
+      // this might be recognized as a subgroup-uniform/scalar.
+      //
+      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
+
+      //
+      // append SK keys first
+      //
+      skc_uint const bits_sk = bits_keys & ~bits_skpk;
+      skc_uint       sk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                       \
+    skc_uint is_sk  = (bits_sk >> I) & 1;               \
+    skc_uint sk_idx = skc_ballot(&sk,is_sk);            \
+    if (is_sk) {                                        \
+      smem->lo.sk[sk_idx] = n##I.xk.lo;                 \
+      smem->hi.sk[sk_idx] = n##I.xk.hi;                 \
+    }                                                   \
+  }
+
+      SKC_PLACE_EXPAND();
+
+      //
+      // append PK keys next
+      //
+      skc_uint const bits_pk = bits_keys & bits_skpk;
+      skc_uint       pk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                       \
+    skc_uint is_pk  = (bits_pk >> I) & 1;               \
+    skc_uint pk_idx = skc_ballot(&pk,is_pk);            \
+    if (is_pk) {                                        \
+      smem->lo.pk[pk_idx] = n##I.xk.lo;                 \
+      smem->hi.pk[pk_idx] = n##I.xk.hi;                 \
+    }                                                   \
+  }
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+      printf("%2u * %2u\n",sk,pk);
+#endif
+      //
+      // if total for either the sk or pk queue reaches the
+      // highwater mark then flush it to the extent
+      //
+      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
+      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
+
+      //
+      // if this was the last node then we're done
+      //
+      if (--nodes == 0)
+        return;
+
+      //
+      // otherwise decrement keys
+      //
+      keys -= SKC_RASTER_NODE_COUNT_KEYS;
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/prefix.cl b/src/compute/skc/platforms/cl_12/kernels/prefix.cl index 960b6cf5ff..21a51694da 100644 --- a/src/compute/skc/prefix.cl +++ b/src/compute/skc/platforms/cl_12/kernels/prefix.cl @@ -1,1042 +1,1041 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block.h" -#include "raster.h" -#include "atomic_cl.h" -#include "macros.h" -#include "tile.h" - -// -// INPUT: -// -// TTRK (64-BIT COMPARE) -// -// 0 63 -// | TTSB ID | X | Y | COHORT ID | -// +---------+------+------+-----------+ -// | 27 | 12 | 12 | 13 | -// -// -// TTRK (32-BIT COMPARE) -// -// 0 63 -// | TTSB ID | N/A | X | Y | COHORT ID | -// +---------+-----+------+------+-----------+ -// | 27 | 5 | 12 | 12 | 8 | -// -// -// OUTPUT: -// -// TTSK v2: -// -// 0 63 -// | TTSB ID | PREFIX | N/A | X | Y | -// +---------+--------+------+----+----+ -// | 27 | 1 (=0) | 12 | 12 | 12 | -// -// -// TTPK v1: -// -// 0 63 -// | TTPB ID | ALL ZEROES | SPAN | X | Y | -// +---------+------------+------+-----+-----+ -// | 27 | 1 | 12 | 12 | 12 | -// -// -// TTPK v2: -// -// 0 63 -// | TTPB ID | PREFIX | SPAN | X | Y | -// +---------+--------+------+-----+-----+ -// | 27 | 1 (=1) | 12 | 12 | 12 | -// - -#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1) - -// -// smem accumulator -// - -union skc_subgroup_accum -{ - struct { - SKC_ATOMIC_INT ttp[SKC_TILE_HEIGHT]; - } atomic; - - struct { - skc_ttp_t ttp[SKC_TILE_HEIGHT]; - } aN; - - struct { - SKC_PREFIX_TTP_V ttp[SKC_PREFIX_SUBGROUP_SIZE]; - } vN; - - struct { - SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH]; - } zero; -}; - -// -// -// - -struct skc_subgroup_smem -{ - // prefix accumulator - union skc_subgroup_accum accum; -}; - -// -// -// - -static -skc_uint -skc_subgroup_lane() -{ -#if ( 
SKC_PREFIX_SUBGROUP_SIZE > 1 ) - return get_sub_group_local_id(); -#else - return 0; -#endif -} - -// -// -// - -static -SKC_PREFIX_TTS_V_BITFIELD -skc_tts_get_dy(skc_tts_v_t const ttsv) -{ - // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32] - SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY; - - return dy - (~ttsv >> 31); -} - -static -SKC_PREFIX_TTS_V_BITFIELD -skc_tts_get_py(skc_tts_v_t const ttsv) -{ - return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2); -} - -// -// -// - -static -void -skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v) -{ - // get "altitude" - SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v); - - // get the y pixel coordinate - SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v); - - // - // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid? - // - // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op - // - -#if 0 - if (tts_v != SKC_TTS_INVALID) - printf("< %08X = %u : %d >\n",tts_v,py,dy); -#endif - - // - // scatter-add the "altitude" to accumulator - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C != SKC_TTS_INVALID) { \ - SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \ - } - -#else - // - // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS - // - // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C == SKC_TTS_INVALID) \ - return; \ - smem->accum.aN.ttp[py C] = dy C; -#endif - - SKC_PREFIX_TTS_VECTOR_INT_EXPAND(); -} - -// -// The implication here is that if our device configuration has a -// rectangular 1:2 tile then we need a block size of at least 2 -// subblocks. 
The subblock size of course needs to match the length of -// the smallest tile side. -// - -static -void -skc_accum_flush(__local struct skc_subgroup_smem * const smem, - __global skc_bp_elem_t * const bp_elems, - skc_block_id_t const pb_id) -{ - // load the ttp elements - SKC_PREFIX_TTP_V const ttp_v = smem->accum.vN.ttp[get_sub_group_local_id()]; - skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); - -#if ( SKC_TILE_RATIO == 1 ) - - bp_elems[offset] = ttp_v; - -#elif ( SKC_TILE_RATIO == 2 ) - - vstore2(ttp_v,offset,bp_elems); - -#else - -#error("tile ratio greater than 2 not supported") - -#endif -} - -// -// -// - -static -void -skc_accum_reset(__local struct skc_subgroup_smem * const smem) -{ - for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++) - smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 ); -} - -// -// get next sk key -// - -static -skc_ttsk_s_t -skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v, - skc_uint * const sk_next, - skc_int * const rkpk_rem) -{ - // decrement count - *rkpk_rem -= 1; - -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT with subgroup support is easy - // - // SIMT without subgroup support can always emulate with smem - // -#if 0 - // - // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly - // broadcast a uint2 cast to a long. It was probably bad to do this - // anyway without a union wrapping the TTSK scalar type. - // - // Consider creating a union { ulong; uint2 } at a later date -- - // probably no need to ever do this unless it makes broadcast faster - // which is unlikely since it will probably be implemented as 2 - // 32-bit broadcasts. - // - // Additionally, the TTRK and TTXK key bitfield sizes are probably - // cast in stone and we aren't going to change them no matter - // architecture we're on. 
- // - skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++); -#else - skc_ttsk_s_t sk_s; - - sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next); - sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next); - *sk_next += 1; -#endif - -#else - // - // SIMD will always grab component .s0 and then rotate the vector - // - sk_s = ( sk_v->s0 ); - - skc_ttsk_v_rotate_down(sk_v); - -#endif - - return sk_s; -} - -// -// -// - -static -skc_raster_yx_s -skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next) -{ -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT with subgroup support is easy - // - // SIMT without subgroup support can always emulate with smem - // - skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next); - -#else - // - // SIMD will always grab component .s0 and then rotate the vector - // - skc_raster_yx_s const yx_s = ( sk_v->s0.hi ); - -#endif - - return yx_s; -} - -// -// mask off ttsb id -// - -static -skc_block_id_s_t -skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s) -{ - return ( sk_s->lo & SKC_TTXK_LO_MASK_ID ); -} - -// -// load tts_v as early as possible -// - -static -skc_tts_v_t -skc_load_tts(__global skc_bp_elem_t * const bp_elems, - skc_block_id_s_t const sb_id) -{ - return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] ); -} - -// -// massage ttrk keys into ttsk keys -// - -static -void -skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v) -{ - sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID; // clear high (N/A) bits - sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits -} - -// -// replenish ttsk keys -// - -static -void -skc_ttsk_v_replenish(skc_ttsk_v_t * const sk_v, - skc_uint * const sk_next, - skc_uint * const rks_next, - __global skc_ttrk_e_t const * const rks) -{ - // if there are still keys available then return - if (*sk_next < SKC_PREFIX_TTXK_V_SIZE) - return; - - // - // otherwise, replenish sk_v - // - // NOTE NOTE NOTE -- we are assuming rks[] 
extent size is always - // divisible by TTXK_V_SIZE and therefore loading some keys from the - // next raster is OK. - // - *sk_next = 0; - *rks_next += SKC_PREFIX_SUBGROUP_SIZE; - *sk_v = rks[*rks_next]; - -#if 0 - printf("* %08X ( %3u, %3u )\n", - sk_v->hi, - (sk_v->hi >> 12) & 0xFFF, - (sk_v->hi ) & 0xFFF); -#endif - - skc_ttrk_to_ttsk(sk_v); - -#if 0 - printf("! %08X ( %3u, %3u )\n", - sk_v->hi, - (sk_v->hi >> 20) & 0xFFF, - (sk_v->hi >> 8) & 0xFFF); -#endif -} - -// -// replenish block ids -// -// note that you can't overrun the block id pool since it's a ring -// - -static -void -skc_blocks_replenish(skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) - -{ - *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE; - *blocks = bp_ids[*blocks_idx & bp_mask]; - *blocks_next = 0; - -#if 0 - printf("replenish blocks: %u\n",*blocks); -#endif -} - -// -// -// - -static -skc_block_id_t -skc_blocks_get_next(skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // replenish? 
- if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE) - { - skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - } - -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); - -#else - // - // SIMD - // - skc_block_id_t id = blocks->s0; - - skc_shuffle_down_1(*blocks); - -#endif - - *blocks_next += 1; - - return id; -} - -// -// subblock allocator -// - -#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) - -static -skc_block_id_t -skc_subblocks_get_next_pb_id(skc_block_id_t * const subblocks, - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - } - - skc_block_id_t const pb_id = *subblocks; - - *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks - - return pb_id; -} - -#endif - -// -// append a ttsk key to the work-in-progress node -// - -static -void -skc_node_v_append_sk(skc_ttsk_s_t const * const sk_s, - - skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - skc_uint * const xk_v_idx, - __global skc_bp_elem_t * const bp_elems, - - skc_int const rkpk_rem, - - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, - __global skc_block_id_t const * const bp_ids) -{ - // - // Append an sk key to the in-register xk_v vector - // - // If the work-in-progress node in gmem will only have room for one - // more key then: - // - // - if this was the final SK then write out xk_v and exit - // - // - otherwise, acquire a block id, link it, write out xk_v, - // prepare new node - // - // Note that this does *not* try to squeeze in a final key into the - // next node slot. 
This optimization isn't worth the added - // down-pipeline complexity. - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) - { - *xk_v = *sk_s; - } - - *xk_v_next += 1; - - // are there more keys coming? - if (rkpk_rem > 0) - { - // is the node almost full? - if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) - { - skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - - if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) - { - xk_v->lo = id; - xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary - } - - // store xk_v (uint2) to bp (uint) - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // update node elem idx - *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // reset node count - *xk_v_next = 0; - } - // is xk_v full? 
- else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) - { - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // increment node elem idx - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - } - } - else - { - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) - { - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - - bp_elems[*xk_v_idx] = SKC_UINT_MAX; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; - } - } - -#else - // - // SIMD - // - -#endif -} - -// -// -// - -static -skc_ttpk_s_t -skc_ttpk_create(skc_raster_yx_s const yx_prev, - skc_raster_yx_s const yx_next, - skc_block_id_t const pb_id) -{ - // - yx_prev is already incremented by one - // - yx_span is already shifted up at hi.x - skc_uint const yx_span = yx_next - yx_prev; - - skc_ttpk_s_t pk; - - // turn on prefix bit | shift span bits upward - pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN); - - // shift down high span bits | yx of tile - pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("* %08v2X : %u\n",pk,yx_span); -#endif - - return pk; -} - -// -// append a ttpk key to the work-in-progress node -// - -static -void -skc_node_v_append_pk(skc_ttpk_s_t const * const pk_s, - - skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - skc_uint * const xk_v_idx, - __global skc_bp_elem_t * const bp_elems, - - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, - __global skc_block_id_t const * const bp_ids) -{ - // - // append a pk key 
to the in-register xk_v vector - // - // if the work-in-progress node in gmem will only have room for one - // more key then: - // - // - if this was the final SK then write out xk_v and exit - // - // - otherwise, acquire a block id, link it, write out xk_v, - // prepare new node - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) - { - *xk_v = *pk_s; - } - - *xk_v_next += 1; - - // is the node almost full? - if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) - { - skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - - if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) - { - xk_v->lo = id; - xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary - } - - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // update node elem idx - *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // reset node count - *xk_v_next = 0; - } - // is xk_v full? 
- else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) - { - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // increment node elem idx - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - } - -#else - // - // SIMD - // -#endif -} - -// -// append the first 3 fields of meta info to the raster header -// - -static -void -skc_node_v_init_header(skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - union skc_raster_cohort_meta_out const * const meta) -{ -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() < 2) - { - *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi; - } - -#if 0 - if (get_sub_group_local_id() == 0) - printf("header: %08v4X\n",meta->u32v4); -#endif - - // - // increment counter: uint4 + uint4 = uint2 x 4 - // - *xk_v_next = 2 + 2; // +2 for unitialized bounds - -#else - // - // SIMD - // - -#endif -} - -// -// -// - -__kernel -SKC_PREFIX_KERNEL_ATTRIBS -void -skc_kernel_prefix(__global skc_uint const * const bp_atomics, - __global skc_block_id_t const * const bp_ids, - __global skc_bp_elem_t * const bp_elems, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_ttrk_e_t const * const rks, - __global skc_block_id_t * const map, - __global skc_uint const * const metas, - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem smem[1]; -#else - __local struct skc_subgroup_smem smems[SKC_PREFIX_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id(); -#endif - - // - // where is this subgroup in the grid? 
- // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const sgi = get_group_id(0); -#else - skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - skc_uint const sgl = get_sub_group_local_id(); - - // - // return if this subgroup is excess - // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 ) - if (sgi >= count) - return; -#endif - - // - // get meta info for this subgroup's raster - // - union skc_raster_cohort_meta_out const meta = { vload4(sgi,metas) }; - skc_uint const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("%3u : %5u / %5u / %5u / %5u / %u\n", - sgi, - meta.blocks, - meta.offset, - meta.nodes, - meta.keys, - reads); -#endif - - // - // preload blocks -- align on subgroup - // - skc_uint blocks_idx = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); - skc_block_id_v_t blocks = bp_ids[blocks_idx & bp_mask]; - skc_uint blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK); - - // - // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset - // - skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // - // initialize raster header -- assumes block is greater than 8 words (4 doublewords) - // - skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX }; - skc_uint xk_v_next; - - skc_node_v_init_header(&xk_v,&xk_v_next,&meta); - - // - // no keys -- this is an empty raster! 
- // - if (meta.keys == 0) - { - bp_elems[xk_v_idx ] = xk_v.lo; - bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi; - - while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) - { - xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - - bp_elems[xk_v_idx] = SKC_UINT_MAX; - bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; - } - - return; - } - - // - // load TTRK keys and in-place convert to TTSK keys - // - skc_uint rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); - skc_ttsk_v_t sk_v = rks[rks_next]; - skc_uint sk_next = (meta.offset & SKC_PREFIX_SUBGROUP_MASK); - skc_int rkpk_rem = meta.keys; // signed count of remaining rk+pk keys - -#if 0 - printf("* %08X ( %3u, %3u )\n", - sk_v.hi, - (sk_v.hi >> 12) & 0xFFF, - (sk_v.hi ) & 0xFFF); -#endif - - skc_ttrk_to_ttsk(&sk_v); - -#if 0 - printf("! %08X ( %3u, %3u )\n", - sk_v.hi, - (sk_v.hi >> 20) & 0xFFF, - (sk_v.hi >> 8) & 0xFFF); -#endif - - // - // subblocks - // -#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) - skc_block_id_t subblocks = 0; -#endif - - // - // begin "scan" of tiles - // - skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next); - - // - // zero the accumulator - // - skc_accum_reset(smem); - - while (true) - { - // get next rk key - skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem); - - // load ttsb id - skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s); - - // load tts_v transaction "in flight" as early as possible - skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id); - -#if 0 - printf("{ %08X }\n",tts_v); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("[ %d, %X ]\n",rkpk_rem,sb_id); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF); -#endif - - // - // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF - // TIME AND SIMD'IZED - // - - // if yx's 
don't match then we're either issuing a ttpk or - // resetting the accumulator - if (sk_s.hi != yx_prev) - { - // if yx_next.y == yx_last.y then x changed - if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0) - { - // - // if the tile is not square then it's ratio is 1:2 - // -#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 - skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks, - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); -#else - skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); -#endif - - // flush accumulated ttp vector to block/subblock at ttpb_id - skc_accum_flush(smem,bp_elems,pb_id); - -#if 0 - if (get_sub_group_local_id() == 0) - { - printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n", - pb_id, - (yx_prev >> SKC_TTXK_HI_OFFSET_Y), - (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF, - (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF, - (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF); - } -#endif - - // - // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP - // - rkpk_rem -= 1; - - // create the pk - skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id); - - // append pk key to xk buffer - skc_node_v_append_pk(&pk_s, - - &xk_v, - &xk_v_next, - &xk_v_idx, - bp_elems, - - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); - } - else if (rkpk_rem > 0) // we're starting a new tile row - { - skc_accum_reset(smem); - } - } - - // - // append sk key to node_v - // - // if rkpk_rem is zero then return from kernel - // - skc_node_v_append_sk(&sk_s, - - &xk_v, - &xk_v_next, - &xk_v_idx, - bp_elems, - - rkpk_rem, - - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); - - // we're done if no more sk keys - if (rkpk_rem == 0) - break; - - // move to new tile - yx_prev = sk_s.hi; - - // scatter tts values into accumulator - skc_accum_scatter(smem,tts_v); - - // replenish sk keys - 
skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks); - } -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "block.h"
+#include "raster.h"
+#include "atomic_cl.h"
+#include "raster_builder_cl_12.h"
+#include "device_cl_12.h"
+
+//
+// INPUT:
+//
+// TTRK (64-BIT COMPARE)
+//
+// 0 63
+// | TTSB ID | X | Y | COHORT ID |
+// +---------+------+------+-----------+
+// | 27 | 12 | 12 | 13 |
+//
+//
+// TTRK (32-BIT COMPARE)
+//
+// 0 63
+// | TTSB ID | N/A | X | Y | COHORT ID |
+// +---------+-----+------+------+-----------+
+// | 27 | 5 | 12 | 12 | 8 |
+//
+//
+// OUTPUT:
+//
+// TTSK v2:
+//
+// 0 63
+// | TTSB ID | PREFIX | N/A | X | Y |
+// +---------+--------+------+----+----+
+// | 27 | 1 (=0) | 12 | 12 | 12 |
+//
+//
+// TTPK v1:
+//
+// 0 63
+// | TTPB ID | ALL ZEROES | SPAN | X | Y |
+// +---------+------------+------+-----+-----+
+// | 27 | 1 | 12 | 12 | 12 |
+//
+//
+// TTPK v2:
+//
+// 0 63
+// | TTPB ID | PREFIX | SPAN | X | Y |
+// +---------+--------+------+-----+-----+
+// | 27 | 1 (=1) | 12 | 12 | 12 |
+//
+
+#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1)
+
+//
+// smem accumulator
+//
+
+//
+// Shared-memory prefix accumulator: four overlapping views of the
+// same per-tile TTP storage --
+//
+//   .atomic : for atomic scatter-add on SIMT devices
+//   .aN     : plain scalar access
+//   .vN     : subgroup-vector access (used when flushing)
+//   .zero   : wide view used only to clear the accumulator quickly
+//
+union skc_subgroup_accum
+{
+  struct {
+    SKC_ATOMIC_INT ttp[SKC_TILE_HEIGHT];
+  } atomic;
+
+  struct {
+    skc_ttp_t ttp[SKC_TILE_HEIGHT];
+  } aN;
+
+  struct {
+    SKC_PREFIX_TTP_V ttp[SKC_PREFIX_SUBGROUP_SIZE];
+  } vN;
+
+  struct {
+    SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH];
+  } zero;
+};
+
+//
+//
+//
+
+//
+// Per-subgroup shared ("__local") memory -- currently holds only the
+// prefix accumulator.
+//
+struct skc_subgroup_smem
+{
+  // prefix accumulator
+  union skc_subgroup_accum accum;
+};
+
+//
+//
+//
+
+//
+// Lane index within the subgroup. On the SIMD build a "subgroup" is a
+// single lane, so this is always 0.
+//
+static
+skc_uint
+skc_subgroup_lane()
+{
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  return get_sub_group_local_id();
+#else
+  return 0;
+#endif
+}
+
+//
+//
+//
+
+//
+// Unpack the signed 'dy' altitude field from a TTS vector.
+//
+static
+SKC_PREFIX_TTS_V_BITFIELD
+skc_tts_get_dy(skc_tts_v_t const ttsv)
+{
+  // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32]
+  SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY;
+
+  // (~ttsv >> 31) is -1 exactly when the packed dy is non-negative
+  // (assumes arithmetic shift on a signed lane -- see SKC_TTS types),
+  // so subtracting it bumps [0,31] up to [+1,+32] and skips zero.
+  return dy - (~ttsv >> 31);
+}
+
+//
+// Extract the tile-relative y pixel coordinate from the TTS 'ty'
+// field, discarding the subpixel bits.
+//
+static
+SKC_PREFIX_TTS_V_BITFIELD
+skc_tts_get_py(skc_tts_v_t const ttsv)
+{
+  return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2);
+}
+
+//
+//
+//
+
+//
+// Scatter-add each valid TTS lane's 'dy' altitude into the shared
+// accumulator row indexed by its 'py' pixel row. Lanes holding
+// SKC_TTS_INVALID are skipped.
+//
+static
+void
+skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v)
+{
+  // get "altitude"
+  SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v);
+
+  // get the y pixel coordinate
+  SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v);
+
+  //
+  // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid?
+  //
+  // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op
+  //
+
+#if 0
+  if (tts_v != SKC_TTS_INVALID)
+    printf("< %08X = %u : %d >\n",tts_v,py,dy);
+#endif
+
+  //
+  // scatter-add the "altitude" to accumulator
+  //
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
+  //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) \
+ if (tts_v C != SKC_TTS_INVALID) { \
+ SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \
+ }
+
+#else
+  //
+  // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS
+  //
+  // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT
+  //
+  // NOTE(review): this path stores with '=' rather than '+=' -- if
+  // two TTS values in the same vector share a py row the first add is
+  // lost. Confirm intended scatter-ADD semantics for the SIMD build.
+  //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) \
+ if (tts_v C == SKC_TTS_INVALID) \
+ return; \
+ smem->accum.aN.ttp[py C] = dy C;
+#endif
+
+  SKC_PREFIX_TTS_VECTOR_INT_EXPAND();
+}
+
+//
+// The implication here is that if our device configuration has a
+// rectangular 1:2 tile then we need a block size of at least 2
+// subblocks. The subblock size of course needs to match the length of
+// the smallest tile side.
+//
+
+//
+// Write the accumulated TTP column out to the TTPB (sub)block 'pb_id'
+// in the block pool. For a 1:2 tile each lane stores two words via
+// vstore2 (vstore scales the offset by the vector width).
+//
+static
+void
+skc_accum_flush(__local struct skc_subgroup_smem * const smem,
+ __global skc_bp_elem_t * const bp_elems,
+ skc_block_id_t const pb_id)
+{
+  // load the ttp elements
+  SKC_PREFIX_TTP_V const ttp_v = smem->accum.vN.ttp[get_sub_group_local_id()];
+  skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
+
+#if ( SKC_TILE_RATIO == 1 )
+
+  bp_elems[offset] = ttp_v;
+
+#elif ( SKC_TILE_RATIO == 2 )
+
+  vstore2(ttp_v,offset,bp_elems);
+
+#else
+
+#error("tile ratio greater than 2 not supported")
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Zero the shared accumulator using the wide 'zero' view, striding
+// the writes across the subgroup.
+//
+static
+void
+skc_accum_reset(__local struct skc_subgroup_smem * const smem)
+{
+  for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++)
+    smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 );
+}
+
+//
+// get next sk key
+//
+
+//
+// Return the next sk key, broadcast to all lanes, advancing the read
+// cursor and decrementing the remaining rk+pk key count.
+//
+static
+skc_ttsk_s_t
+skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v,
+ skc_uint * const sk_next,
+ skc_int * const rkpk_rem)
+{
+  // decrement count
+  *rkpk_rem -= 1;
+
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT with subgroup support is easy
+  //
+  // SIMT without subgroup support can always emulate with smem
+  //
+#if 0
+  //
+  // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly
+  // broadcast a uint2 cast to a long. It was probably bad to do this
+  // anyway without a union wrapping the TTSK scalar type.
+  //
+  // Consider creating a union { ulong; uint2 } at a later date --
+  // probably no need to ever do this unless it makes broadcast faster
+  // which is unlikely since it will probably be implemented as 2
+  // 32-bit broadcasts.
+  //
+  skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++);
+#else
+  // broadcast each 32-bit half separately
+  skc_ttsk_s_t sk_s;
+
+  sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next);
+  sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next);
+  *sk_next += 1;
+#endif
+
+#else
+  //
+  // SIMD will always grab component .s0 and then rotate the vector
+  //
+  // FIX: declare sk_s here -- it was previously assigned without a
+  // declaration, which does not compile on the SIMD build.
+  //
+  skc_ttsk_s_t sk_s = ( sk_v->s0 );
+
+  skc_ttsk_v_rotate_down(sk_v);
+
+#endif
+
+  return sk_s;
+}
+
+//
+//
+//
+
+//
+// Peek the yx (hi word) of the first sk key without consuming it --
+// used to seed the tile "scan".
+//
+static
+skc_raster_yx_s
+skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next)
+{
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT with subgroup support is easy
+  //
+  // SIMT without subgroup support can always emulate with smem
+  //
+  skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next);
+
+#else
+  //
+  // SIMD will always grab component .s0 and then rotate the vector
+  //
+  skc_raster_yx_s const yx_s = ( sk_v->s0.hi );
+
+#endif
+
+  return yx_s;
+}
+
+//
+// mask off ttsb id
+//
+
+//
+// Mask the TTSB block id out of an sk key's low word.
+//
+static
+skc_block_id_s_t
+skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s)
+{
+  return ( sk_s->lo & SKC_TTXK_LO_MASK_ID );
+}
+
+//
+// load tts_v as early as possible
+//
+
+//
+// Load this lane's TTS word from subblock 'sb_id' -- issued as early
+// as possible to hide global-memory latency.
+//
+static
+skc_tts_v_t
+skc_load_tts(__global skc_bp_elem_t * const bp_elems,
+ skc_block_id_s_t const sb_id)
+{
+  return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] );
+}
+
+//
+// massage ttrk keys into ttsk keys
+//
+
+//
+// In-place convert a TTRK key vector to TTSK layout: keep only the
+// TTSB id in .lo and shift the cohort id out of .hi (zero-filling the
+// low bits, which leaves the PREFIX bit region clear).
+//
+static
+void
+skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v)
+{
+  sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID;  // clear high (N/A) bits
+  sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits
+}
+
+//
+// replenish ttsk keys
+//
+
+//
+// Refill the in-register sk key vector from the rks[] extent once the
+// current vector is exhausted; converts the fresh TTRK keys to TTSK.
+//
+static
+void
+skc_ttsk_v_replenish(skc_ttsk_v_t * const sk_v,
+ skc_uint * const sk_next,
+ skc_uint * const rks_next,
+ __global skc_ttrk_e_t const * const rks)
+{
+  // if there are still keys available then return
+  if (*sk_next < SKC_PREFIX_TTXK_V_SIZE)
+    return;
+
+  //
+  // otherwise, replenish sk_v
+  //
+  // NOTE NOTE NOTE -- we are assuming rks[] extent size is always
+  // divisible by TTXK_V_SIZE and therefore loading some keys from the
+  // next raster is OK.
+  //
+  *sk_next = 0;
+  *rks_next += SKC_PREFIX_SUBGROUP_SIZE;
+  *sk_v = rks[*rks_next];
+
+#if 0
+  printf("* %08X ( %3u, %3u )\n",
+ sk_v->hi,
+ (sk_v->hi >> 12) & 0xFFF,
+ (sk_v->hi ) & 0xFFF);
+#endif
+
+  skc_ttrk_to_ttsk(sk_v);
+
+#if 0
+  printf("! %08X ( %3u, %3u )\n",
+ sk_v->hi,
+ (sk_v->hi >> 20) & 0xFFF,
+ (sk_v->hi >> 8) & 0xFFF);
+#endif
+}
+
+//
+// replenish block ids
+//
+// note that you can't overrun the block id pool since it's a ring
+//
+
+//
+// Refill the in-register block id vector from the block pool ring.
+//
+// note that you can't overrun the block id pool since it's a ring
+//
+static
+void
+skc_blocks_replenish(skc_uint * const blocks_next,
+ skc_uint * const blocks_idx,
+ skc_block_id_v_t * const blocks,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const bp_ids)
+
+{
+  *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE;
+  *blocks = bp_ids[*blocks_idx & bp_mask];
+  *blocks_next = 0;
+
+#if 0
+  printf("replenish blocks: %u\n",*blocks);
+#endif
+}
+
+//
+//
+//
+
+//
+// Pop the next block id from the preloaded vector, replenishing from
+// the block pool ring when the vector is exhausted.
+//
+static
+skc_block_id_t
+skc_blocks_get_next(skc_uint * const blocks_next,
+ skc_uint * const blocks_idx,
+ skc_block_id_v_t * const blocks,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const bp_ids)
+{
+  // replenish?
+  if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE)
+    {
+      skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
+    }
+
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT -- broadcast the id held by lane 'blocks_next'
+  //
+  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
+
+#else
+  //
+  // SIMD -- take .s0 and rotate the vector down
+  //
+  skc_block_id_t id = blocks->s0;
+
+  skc_shuffle_down_1(*blocks);
+
+#endif
+
+  *blocks_next += 1;
+
+  return id;
+}
+
+//
+// subblock allocator
+//
+
+#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
+
+//
+// Subblock allocator for TTPB blocks: carves one or two subblocks
+// (SKC_TILE_RATIO) at a time out of a whole block, fetching a fresh
+// block when the current one is used up.
+//
+static
+skc_block_id_t
+skc_subblocks_get_next_pb_id(skc_block_id_t * const subblocks,
+ skc_uint * const blocks_next,
+ skc_uint * const blocks_idx,
+ skc_block_id_v_t * const blocks,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const bp_ids)
+{
+  // exhausted the current block? grab a new one
+  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+    {
+      *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
+    }
+
+  skc_block_id_t const pb_id = *subblocks;
+
+  *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks
+
+  return pb_id;
+}
+
+#endif
+
+//
+// append a ttsk key to the work-in-progress node
+//
+
+static
+void
+skc_node_v_append_sk(skc_ttsk_s_t const * const sk_s,
+
+ skc_ttxk_v_t * const xk_v,
+ skc_uint * const xk_v_next,
+ skc_uint * const xk_v_idx,
+ __global skc_bp_elem_t * const bp_elems,
+
+ skc_int const rkpk_rem,
+
+ skc_uint * const blocks_next,
+ skc_uint * const blocks_idx,
+ skc_block_id_v_t * const blocks,
+ skc_uint const bp_mask,
+ __global skc_block_id_t const * const bp_ids)
+{
+  //
+  // Append an sk key to the in-register xk_v vector
+  //
+  // If the work-in-progress node in gmem will only have room for one
+  // more key then:
+  //
+  // - if this was the final SK then write out xk_v and exit
+  //
+  // - otherwise, acquire a block id, link it, write out xk_v,
+  // prepare new node
+  //
+  // Note that this does *not* try to squeeze in a final key into the
+  // next node slot. This optimization isn't worth the added
+  // down-pipeline complexity.
+  //
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  // only the lane that owns slot (*xk_v_next) captures the key
+  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
+    {
+      *xk_v = *sk_s;
+    }
+
+  *xk_v_next += 1;
+
+  // are there more keys coming?
+  if (rkpk_rem > 0)
+    {
+      // is the node almost full?
+      if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
+        {
+          skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
+
+          // last lane stores the link to the next node block
+          if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
+            {
+              xk_v->lo = id;
+              xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
+            }
+
+          // store xk_v (uint2) to bp (uint)
+          bp_elems[*xk_v_idx ] = xk_v->lo;
+          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+          printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+          // reinitialize xk_v
+          xk_v->lo = SKC_UINT_MAX;
+          xk_v->hi = SKC_UINT_MAX;
+
+          // update node elem idx
+          *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+          // reset node count
+          *xk_v_next = 0;
+        }
+      // is xk_v full?
+      else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
+        {
+          // store xk_v to bp
+          bp_elems[*xk_v_idx ] = xk_v->lo;
+          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+          printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+          // reinitialize xk_v
+          xk_v->lo = SKC_UINT_MAX;
+          xk_v->hi = SKC_UINT_MAX;
+
+          // increment node elem idx
+          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
+        }
+    }
+  else
+    {
+      // final key: flush xk_v ...
+      bp_elems[*xk_v_idx ] = xk_v->lo;
+      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+      printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+      // ... and pad the rest of the block with invalid keys
+      while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
+        {
+          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
+
+          bp_elems[*xk_v_idx] = SKC_UINT_MAX;
+          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
+        }
+    }
+
+#else
+  //
+  // SIMD
+  //
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Build a TTPK key spanning the gap between two sk tiles: prefix bit
+// set, span packed across the lo/hi words, yx of the starting tile.
+//
+static
+skc_ttpk_s_t
+skc_ttpk_create(skc_raster_yx_s const yx_prev,
+ skc_raster_yx_s const yx_next,
+ skc_block_id_t const pb_id)
+{
+  // - yx_prev is already incremented by one
+  // - yx_span is already shifted up at hi.x
+  skc_uint const yx_span = yx_next - yx_prev;
+
+  skc_ttpk_s_t pk;
+
+  // turn on prefix bit | shift span bits upward
+  pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN);
+
+  // shift down high span bits | yx of tile
+  pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("* %08v2X : %u\n",pk,yx_span);
+#endif
+
+  return pk;
+}
+
+//
+// append a ttpk key to the work-in-progress node
+//
+
+static
+void
+skc_node_v_append_pk(skc_ttpk_s_t const * const pk_s,
+
+ skc_ttxk_v_t * const xk_v,
+ skc_uint * const xk_v_next,
+ skc_uint * const xk_v_idx,
+ __global skc_bp_elem_t * const bp_elems,
+
+ skc_uint * const blocks_next,
+ skc_uint * const blocks_idx,
+ skc_block_id_v_t * const blocks,
+ skc_uint const bp_mask,
+ __global skc_block_id_t const * const bp_ids)
+{
+  //
+  // append a pk key to the in-register xk_v vector
+  //
+  // if the work-in-progress node in gmem will only have room for one
+  // more key then:
+  //
+  // - if this was the final SK then write out xk_v and exit
+  //
+  // - otherwise, acquire a block id, link it, write out xk_v,
+  // prepare new node
+  //
+  // NOTE(review): unlike skc_node_v_append_sk() there is no rkpk_rem
+  // "final key" flush path here -- presumably a pk is never the last
+  // key appended; confirm against the caller.
+  //
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  // only the lane that owns slot (*xk_v_next) captures the key
+  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
+    {
+      *xk_v = *pk_s;
+    }
+
+  *xk_v_next += 1;
+
+  // is the node almost full?
+  if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
+    {
+      skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
+
+      // last lane stores the link to the next node block
+      if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
+        {
+          xk_v->lo = id;
+          xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
+        }
+
+      // store xk_v to bp
+      bp_elems[*xk_v_idx ] = xk_v->lo;
+      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+      printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+      // reinitialize xk_v
+      xk_v->lo = SKC_UINT_MAX;
+      xk_v->hi = SKC_UINT_MAX;
+
+      // update node elem idx
+      *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+      // reset node count
+      *xk_v_next = 0;
+    }
+  // is xk_v full?
+  else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
+    {
+      // store xk_v to bp
+      bp_elems[*xk_v_idx ] = xk_v->lo;
+      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+      printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+      // reinitialize xk_v
+      xk_v->lo = SKC_UINT_MAX;
+      xk_v->hi = SKC_UINT_MAX;
+
+      // increment node elem idx
+      *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
+    }
+
+#else
+  //
+  // SIMD
+  //
+#endif
+}
+
+//
+// append the first 3 fields of meta info to the raster header
+//
+
+//
+// Seed the work-in-progress key vector with the first 3 fields of the
+// cohort meta info (the raster header) held in lanes 0 and 1.
+//
+static
+void
+skc_node_v_init_header(skc_ttxk_v_t * const xk_v,
+ skc_uint * const xk_v_next,
+ union skc_raster_cohort_meta_out const * const meta)
+{
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  if (get_sub_group_local_id() < 2)
+    {
+      *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi;
+    }
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("header: %08v4X\n",meta->u32v4);
+#endif
+
+  //
+  // increment counter: uint4 + uint4 = uint2 x 4
+  //
+  *xk_v_next = 2 + 2; // +2 for uninitialized bounds
+
+#else
+  //
+  // SIMD
+  //
+
+#endif
+}
+
+//
+//
+//
+
+//
+// PREFIX kernel: one subgroup per raster. Converts the raster's
+// sorted TTRK keys to TTSK keys, accumulates TTS altitudes per tile,
+// emits TTPK span keys between tiles on the same row, and links the
+// resulting keys into a chain of raster nodes in the block pool.
+//
+__kernel
+SKC_PREFIX_KERNEL_ATTRIBS
+void
+skc_kernel_prefix(__global skc_uint const * const bp_atomics,
+ __global skc_block_id_t const * const bp_ids,
+ __global skc_bp_elem_t * const bp_elems,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_ttrk_e_t const * const rks,
+ __global skc_block_id_t * const map,
+ __global skc_uint const * const metas,
+ skc_uint const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem smem[1];
+#else
+  __local struct skc_subgroup_smem smems[SKC_PREFIX_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id();
+#endif
+
+  //
+  // where is this subgroup in the grid?
+  //
+#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
+  skc_uint const sgi = get_group_id(0);
+#else
+  skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  // NOTE(review): sgl appears unused in this kernel body -- the code
+  // below calls get_sub_group_local_id() directly; likewise
+  // bp_atomics and map are not referenced here. Confirm intent.
+  skc_uint const sgl = get_sub_group_local_id();
+
+  //
+  // return if this subgroup is excess
+  //
+#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 )
+  if (sgi >= count)
+    return;
+#endif
+
+  //
+  // get meta info for this subgroup's raster
+  //
+  union skc_raster_cohort_meta_out const meta = { vload4(sgi,metas) };
+  skc_uint const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi];
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("%3u : %5u / %5u / %5u / %5u / %u\n",
+ sgi,
+ meta.blocks,
+ meta.offset,
+ meta.nodes,
+ meta.keys,
+ reads);
+#endif
+
+  //
+  // preload blocks -- align on subgroup
+  //
+  skc_uint blocks_idx = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
+  skc_block_id_v_t blocks = bp_ids[blocks_idx & bp_mask];
+  skc_uint blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK);
+
+  //
+  // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset
+  //
+  skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+  //
+  // initialize raster header -- assumes block is greater than 8 words (4 doublewords)
+  //
+  skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX };
+  skc_uint xk_v_next;
+
+  skc_node_v_init_header(&xk_v,&xk_v_next,&meta);
+
+  //
+  // no keys -- this is an empty raster!
+  //
+  if (meta.keys == 0)
+    {
+      // flush the header and pad the block with invalid keys
+      bp_elems[xk_v_idx ] = xk_v.lo;
+      bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi;
+
+      while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
+        {
+          xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
+
+          bp_elems[xk_v_idx] = SKC_UINT_MAX;
+          bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
+        }
+
+      return;
+    }
+
+  //
+  // load TTRK keys and in-place convert to TTSK keys
+  //
+  skc_uint rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
+  skc_ttsk_v_t sk_v = rks[rks_next];
+  skc_uint sk_next = (meta.offset & SKC_PREFIX_SUBGROUP_MASK);
+  skc_int rkpk_rem = meta.keys; // signed count of remaining rk+pk keys
+
+#if 0
+  printf("* %08X ( %3u, %3u )\n",
+ sk_v.hi,
+ (sk_v.hi >> 12) & 0xFFF,
+ (sk_v.hi ) & 0xFFF);
+#endif
+
+  skc_ttrk_to_ttsk(&sk_v);
+
+#if 0
+  printf("! %08X ( %3u, %3u )\n",
+ sk_v.hi,
+ (sk_v.hi >> 20) & 0xFFF,
+ (sk_v.hi >> 8) & 0xFFF);
+#endif
+
+  //
+  // subblocks
+  //
+#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
+  skc_block_id_t subblocks = 0;
+#endif
+
+  //
+  // begin "scan" of tiles
+  //
+  skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next);
+
+  //
+  // zero the accumulator
+  //
+  skc_accum_reset(smem);
+
+  while (true)
+    {
+      // get next rk key
+      skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem);
+
+      // load ttsb id
+      skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s);
+
+      // load tts_v transaction "in flight" as early as possible
+      skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id);
+
+#if 0
+      printf("{ %08X }\n",tts_v);
+#endif
+
+#if 0
+      if (get_sub_group_local_id() == 0)
+        printf("[ %d, %X ]\n",rkpk_rem,sb_id);
+#endif
+
+#if 0
+      if (get_sub_group_local_id() == 0)
+        printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF);
+#endif
+
+      //
+      // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF
+      // TIME AND SIMD'IZED
+      //
+
+      // if yx's don't match then we're either issuing a ttpk or
+      // resetting the accumulator
+      if (sk_s.hi != yx_prev)
+        {
+          // if yx_next.y == yx_last.y then x changed
+          if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0)
+            {
+              //
+              // if the tile is not square then its ratio is 1:2
+              //
+#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2
+              skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks,
+ &blocks_next,
+ &blocks_idx,
+ &blocks,
+ bp_mask,
+ bp_ids);
+#else
+              skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next,
+ &blocks_idx,
+ &blocks,
+ bp_mask,
+ bp_ids);
+#endif
+
+              // flush accumulated ttp vector to block/subblock at ttpb_id
+              skc_accum_flush(smem,bp_elems,pb_id);
+
+#if 0
+              if (get_sub_group_local_id() == 0)
+                {
+                  printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n",
+ pb_id,
+ (yx_prev >> SKC_TTXK_HI_OFFSET_Y),
+ (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF,
+ (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF,
+ (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF);
+                }
+#endif
+
+              //
+              // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP
+              //
+              rkpk_rem -= 1;
+
+              // create the pk
+              skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id);
+
+              // append pk key to xk buffer
+              skc_node_v_append_pk(&pk_s,
+
+ &xk_v,
+ &xk_v_next,
+ &xk_v_idx,
+ bp_elems,
+
+ &blocks_next,
+ &blocks_idx,
+ &blocks,
+ bp_mask,
+ bp_ids);
+            }
+          else if (rkpk_rem > 0) // we're starting a new tile row
+            {
+              skc_accum_reset(smem);
+            }
+        }
+
+      //
+      // append sk key to node_v
+      //
+      // if rkpk_rem is zero then return from kernel
+      //
+      skc_node_v_append_sk(&sk_s,
+
+ &xk_v,
+ &xk_v_next,
+ &xk_v_idx,
+ bp_elems,
+
+ rkpk_rem,
+
+ &blocks_next,
+ &blocks_idx,
+ &blocks,
+ bp_mask,
+ bp_ids);
+
+      // we're done if no more sk keys
+      if (rkpk_rem == 0)
+        break;
+
+      // move to new tile
+      yx_prev = sk_s.hi;
+
+      // scatter tts values into accumulator
+      skc_accum_scatter(smem,tts_v);
+
+      // replenish sk keys
+      skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/rasterize.cl b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl index c9462ecff5..e622845d9c 100644 --- a/src/compute/skc/rasterize.cl +++ b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl @@ -1,3367 +1,3366 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block_pool_cl.h" - -#include "atomic_cl.h" -#include "common.h" -#include "tile.h" - -// #define SKC_ARCH_AVX2 -// #define SKC_RASTERIZE_SIMD_USES_SMEM - -#define PRINTF_ENABLE 0 -#define PRINTF_BLOCK_COUNT 0 - -// -// NOTE: -// -// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT -// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE -// -// NOTE: -// -// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP. -// -// - -#if 0 // SKC_ARCH_AVX2 - -// #define SKC_RASTERIZE_SUBGROUP_SIZE 1 -// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3 -// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1 - -// #define SKC_TTXB_WORDS 8 - -// #define SKC_RASTERIZE_FLOAT float8 -// #define SKC_RASTERIZE_UINT uint8 -// #define SKC_RASTERIZE_INT int8 -// #define SKC_RASTERIZE_PREDICATE int8 - -// #define SKC_RASTERIZE_BIN_BLOCK uint16 -// #define SKC_RASTERIZE_BIN uint8 - -// #define SKC_RASTERIZE_POOL uint8 -// #define SKC_RASTERIZE_POOL_SCALE 6 - -// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1 -// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 - -// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8() - -#endif - -// -// SIMT -// - -#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE -#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE -#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1) - -// -// -// - -#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2) -#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * 
SKC_RASTERIZE_VECTOR_SIZE) - -// -// -// - -#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 } -#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 } - -// -// -// - -#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS) -#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS) -#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS) -#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS) -#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT) -#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS) - -// -// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" -// -// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ -// -// Lerp in two fma/mad ops: -// -// t * b + ((-t) * a + a) -// -// Note: OpenCL documents mix() as being implemented as: -// -// a + (b - a) * t -// -// But this may be a native instruction on some devices. For example, -// on GEN9 there is an LRP "linear interoplation" opcode but it -// doesn't appear to support half floats. -// -// Feel free to toggle this option and then benchmark and inspect the -// generated code. We really want the double FMA to be generated when -// there isn't support for a LERP/MIX operation. -// - -#if 1 -#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) -#else -#define SKC_LERP(a,b,t) mix(a,b,t) -#endif - -// -// There is no integer MAD in OpenCL with "don't care" overflow -// semantics. -// -// FIXME -- verify if the platform needs explicit MAD operations even -// if a "--fastmath" option is available at compile time. It might -// make sense to explicitly use MAD calls if the platform requires it. 
-// - -#if 1 -#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c)) -#else -#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c) -#endif - -// -// -// - -#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()) - -// -// -// - -union skc_bp_elem -{ - skc_uint u32; - skc_tagged_block_id_t tag_id; - skc_float coord; -}; - -// -// -// - -struct skc_subgroup_smem -{ - // - // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM ) - struct { - union { - - skc_uint winner; - - struct { - skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; - } aN; - - struct { - SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; - } vN; - }; - } subgroup; -#endif - - // - // work-in-progress TTSB blocks and associated YX keys - // - union { - struct { - // FIXME -- some typedefs are valid here - skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; - skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - } aN; -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - struct { - SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - SKC_RASTERIZE_BIN yx; - SKC_RASTERIZE_BIN id; - SKC_RASTERIZE_BIN count; - } vN; -#endif - } bin; -}; - -// -// -// - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) -#define skc_subgroup_lane() 0 -#else -#define skc_subgroup_lane() get_sub_group_local_id() -#endif - -// -// replenish block ids -// -// note that you can't overrun the block id pool since it's a ring -// - -static -void -skc_blocks_replenish(skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // - // get a new vector of block ids -- this is kind of a narrow - // allocation but 
subblocks help stretch out the pool. - // - // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids - // - skc_uint bp_idx = 0; - - if (skc_subgroup_lane() == 0) - { - bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS, - SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads -#if 0 - printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE); -#endif - } - - bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask; - *blocks = bp_ids[bp_idx]; - *blocks_next = 0; -} - -// -// -// - -static -skc_block_id_t -skc_blocks_get_next(skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // replenish? - if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE) - { - skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); - } - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); - -#else - // - // SIMD - // - skc_block_id_t id = blocks->s0; - - skc_shuffle_down_1(*blocks); - -#endif - - *blocks_next += 1; - - return id; -} - -// -// subblock allocator -// - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - -static -skc_block_id_t -skc_subblocks_get_next(skc_block_id_t * const subblocks, - skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); - } - - skc_block_id_t const sb_id = *subblocks; - - *subblocks += 1; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("= %u\n",sb_id); -#endif - - return sb_id; -} - - 
-#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks -#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks - -#else - -#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks -#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks - -#endif - -// -// -// - -static -skc_block_id_t -skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(), - skc_uint * const blocks_next, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_ttsk_v_t * const sk_v, - skc_uint * const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - skc_uint const new_yx) -{ -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t const new_id = skc_subblocks_get_next(subblocks, - blocks_next, - blocks, - bp_atomics, - bp_mask, - bp_ids); -#else - skc_block_id_t const new_id = skc_blocks_get_next(blocks_next, - blocks, - bp_atomics, - bp_mask, // pow2 modulo mask for block pool ring - bp_ids); -#endif - - if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK)) - { - sk_v->lo = new_id; - sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx; -#if 0 - printf("@ ( %3u, %3u ) %u\n", - (new_yx >> 12) & 0xFFF, - (new_yx ) & 0xFFF, - new_id); -#endif - } - - *sk_v_next += 1; - - if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE) - { - *sk_v_next = 0; - - skc_uint sk_idx = 0; - - if (skc_subgroup_lane() == 0) - { - sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE); -#if 0 - printf("+ %u\n",sk_idx); -#endif - } - - sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE ) - if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE) -#endif - { - sk_extent[sk_idx] = *sk_v; -#if 0 - printf("> %u : 
%v2u\n",sk_idx,*sk_v); -#endif - } - } - - return new_id; -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. - // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 0 + - // -- - // 01 - SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 012 + - // ---- - // 0123 - // 01 + - // ---- - // 0123 - // - SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 0123456 + - // -------- - // 01234567 - // 012345 + - // -------- - // 01234567 - // 0123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w); - SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 0123456789abcde + - // ---------------- - // 0123456789abcdef - // 0123456789abcd + - // ---------------- - // 0123456789abcdef - // 0123456789ab + - // ---------------- - // 0123456789abcdef - // 01234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = 
mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); - SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); - SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_add(v); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. - // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 0 + - // -- - // 01 - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 012 + - // ---- - // 0123 - // 01 + - // ---- - // 0123 - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 0123456 + - // -------- - // 01234567 - // 012345 + - // -------- - // 01234567 - // 0123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w); - SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 0123456789abcde + - // 
---------------- - // 0123456789abcdef - // 0123456789abcd + - // ---------------- - // 0123456789abcdef - // 0123456789ab + - // ---------------- - // 0123456789abcdef - // 01234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); - SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); - SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_add(v); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. 
- // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 00 max - // -- - // 01 - SKC_RASTERIZE_UINT const w = max(v.s00,v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 0012 + - // ---- - // 0123 - // 0101 + - // ---- - // 0123 - // - SKC_RASTERIZE_UINT const w = max(v.s0012,v); - SKC_RASTERIZE_UINT const x = max(w.s0101,w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 00123456 + - // -------- - // 01234567 - // 01012345 + - // -------- - // 01234567 - // 01230123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_UINT const w = max(v.s00123456,v); - SKC_RASTERIZE_UINT const x = max(w.s01012345,w); - SKC_RASTERIZE_UINT const y = max(x.s01230123,x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 00123456789abcde + - // ---------------- - // 0123456789abcdef - // 010123456789abcd + - // ---------------- - // 0123456789abcdef - // 01230123456789ab + - // ---------------- - // 0123456789abcdef - // 0123456701234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v); - SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w); - SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x); - SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_max(v); - -#endif -} - -// -// -// - -static -float -skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return v.s1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return v.s3; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 
) - return v.s7; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return v.sf; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return v.s1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return v.s3; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return v.s7; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return v.sf; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); - -#endif -} - -// -// -// - -static -float -skc_subgroup_first(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#else - return v.s0; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,0); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v, - SKC_RASTERIZE_UINT const i) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#else - return shuffle(v,i); -#endif - -#else - // - // SIMT - // - return intel_sub_group_shuffle(v,i); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous - SKC_RASTERIZE_FLOAT const c) // current -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // FIXME -- there are alternative formulations here: - // - // Option 1: - // - // select(c.rotate(+1),p.rotate(-1),(1,0,0,...)) - // - // Option 2: - // - // p is a scalar - // t = c.rotate(+1) - // t.s0 = p; - // - // Option 3: ... 
- // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return p; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return shuffle2(p,c,(uint2)(1,2)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return shuffle2(p,c,(uint4)(3,4,5,6)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)); -#endif - -#else - // - // SIMT - // - return intel_sub_group_shuffle_up(p,c,1); - -#endif -} - -// -// -// - -static -bool -skc_is_lane_first() -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) - // - // SIMD - // - return true; -#else - // - // SIMT - // - return get_sub_group_local_id() == 0; -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_delta_offset() -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return 1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ); -#endif - -#else - // - // SIMT - // - return 1.0f + get_sub_group_local_id(); - -#endif - -} - -// -// -// - -static -int -skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - return any(p); -#else - // - // SIMT - // - return sub_group_any(p); -#endif -} - -// -// -// - -#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK) - -void -skc_segment_next(__global union skc_bp_elem * const bp_elems, - skc_uint * const nodeword, - skc_block_id_t * const id) -{ - if ((++*id & 
SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword)) - { - *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS; - } - - skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id; - - *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - } -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y) -{ - return native_sqrt(x * x + y * y); -} - -// -// Wang's Formula (1985) -// - -#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned - -#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32) - -#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON)) -#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON)) - -#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y) -#define SKC_WANG_SQRT(x) native_sqrt(x) - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, - SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, - SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y, - SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y) -{ - // - // Return the number of evenly spaced (in the parametric sense) line - // segments that are guaranteed to be within "epsilon" error of the - // curve. - // - // We're then going to take multiples of the reciprocal of this - // number so that the segmentation can be distributed across the - // subgroup. - // - // Note, this can probably be slightly optimized per architecture - // but it's probably far from being a hotspot since it's all - // straight-line unpredicated code. - // - // The result is an integer ranging from [1.0,#segments] - // - // Note that even if all of the control points are coincident, the - // max(1.0f) will categorize this as a line of 1 segment. - // - // This is what we want! 
We want to convert cubics to lines as - // easily as possible and *then* cull lines that are either - // horizontal or zero length. - // - return max(1.0f, - ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC * - SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x), - fabs(t3x - 2.0f * t2x + t1x)), - max(fabs(t2y - 2.0f * t1y + t0y), - fabs(t3y - 2.0f * t2y + t1y)))))); -} - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, - SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, - SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y) -{ - return max(1.0f, - ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * - SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x), - fabs(t2y - 2.0f * t1y + t0y))))); -} - -// -// rational curves -// - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_cubic_rat() -{ - return 0.0f; -} - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_quad_rat() -{ - return 0.0f; -} - -// -// flush any work-in-progress blocks and return unused block ids -// - -static -void -skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_block_id_v_t * const blocks, - skc_uint const blocks_next, - skc_ttsk_v_t * const sk_v, - skc_uint const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - __local struct skc_subgroup_smem volatile * const smem) -{ - // - // flush non-empty bins - // - // FIXME -- accelerate this iteration/search with a subgroup operation - // - for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++) - { - if (smem->bin.aN.count[ii] > 0) - { - skc_block_id_v_t const id = smem->bin.aN.id[ii]; - skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - skc_uint const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()]; -#if 0 - printf("???????? 
: [ %10u = %10u : %08X ]\n",id,idx,tts); -#endif - bp_elems[idx].u32 = tts; - } - - // - // FIXME -- vectorize with vstoreN() - // - } - - // - // return remaining block ids back to the pool - // - skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next; - - if (blocks_rem > 0) - { - skc_uint bp_idx = 0; - - if (skc_subgroup_lane() == 0) - { - bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem); - -#if 0 - printf("r-: %8u + %u\n",bp_idx,blocks_rem); -#endif - } - - bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask; - - if (skc_subgroup_lane() >= blocks_next) - { - bp_ids[bp_idx] = *blocks; - } - } - - // - // flush work-in-progress ryx keys - // - if (sk_v_next > 0) - { - skc_uint sk_idx = 0; - - if (skc_subgroup_lane() == 0) - { - sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next); -#if 0 - printf("* %u\n",sk_idx); -#endif - } - - sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); - - if (skc_subgroup_lane() < sk_v_next) - { - sk_extent[sk_idx] = *sk_v; - } - } -} - -// -// If there are lanes that were unable to append to a bin because -// their hashes collided with a bin's current ryx key then those bins -// must be ejected. -// -// Note that we do not eject "full" bins because lazily waiting for a -// collision results in simpler code. 
-// - -static -void -skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_block_id_t * const subblocks, - skc_block_id_v_t * const blocks, - skc_uint * const blocks_next, - skc_ttsk_v_t * const sk_v, - skc_uint * const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - __local struct skc_subgroup_smem volatile * const smem, - SKC_RASTERIZE_UINT const hash, - SKC_RASTERIZE_UINT const yx, - SKC_RASTERIZE_PREDICATE is_collision) // pass by value -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - - // - // FIXME -- this code is now stale with the changes to the - // subblock/block allocation strategy - // - - // - // get local TTSB ID queue count - // - skc_uint ttsb_id_count = smem->pool.count; // scalar - - // init hash bit mask - skc_uint component_mask = 0; - - for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++) - { - // if no collision continue - if (((int*)&is_collision)[cc] == 0) - continue; - - uint const winner = ((uint*)&hash)[cc]; - uint const component_bit = 1u << winner; - - // if already processed this hash then continue - if (component_mask & component_bit) - continue; - - // update component mask - component_mask |= component_bit; - - // - // new winner requires ejecting the old TTSB - // - if (smem->bin.aN.count[winner] > 0) - { - skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - - bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; - } - - // - // ensure there is at least one TTSK and TTSB ID - // - if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE) - { - // - // update remaining count - // - ttsb_id_count = 0; - - // - // flush accumulated ttsk_ryx keys - // - uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - 
(cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count - -#if 0 - printf("# %u\n",idx); -#endif - - for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE) - { - ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii); - } - - // - // allocate more ttsb ids from pool - // - uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads - - for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE) - smem->pool.aN.id[ii] = bp_ids[id + ii]; - } - - // - // invalidate the winning block - // - - // - // update bin with winning yx, new ttsb id and zero count - // - // all lanes are loading/storing from/to the same index - // - smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID ); - smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count]; - smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc]; - smem->bin.aN.count[winner] = 0; - - // - // update count - // - ttsb_id_count += 1; - } - - // - // save count - // - smem->pool.count = ttsb_id_count; - -#else - // - // SIMT - // - - do { - // - // only one lane will win! - // - if (is_collision) - smem->subgroup.winner = hash; - - barrier(CLK_LOCAL_MEM_FENCE); - - // - // which bin is being ejected? - // - skc_uint const winner = smem->subgroup.winner; - - // - // which colliding hash is taking over the bin? 
- // - SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner); - - // - // all lanes with the same hash will try to store but only one - // lane will win - // - if (is_winner) - smem->subgroup.winner = yx; - - barrier(CLK_LOCAL_MEM_FENCE); - - // - // flush this block to the pool - // - if (smem->bin.aN.count[winner] > 0) - { - skc_block_id_v_t const id = smem->bin.aN.id[winner]; - skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; -#if 0 - printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts); -#endif - bp_elems[idx].u32 = tts; - } - - // - // append new ttsk - // - skc_uint const new_yx = smem->subgroup.winner; - skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(), - blocks_next, - bp_atomics, - bp_mask, // pow2 modulo mask for block pool ring - bp_ids, - cohort_atomics, - sk_v, - sk_v_next, - sk_extent, - new_yx); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf(">>> %9u\n",new_id); - } -#endif - - // - // update bin with winning yx, new ttsb id and zero count - // - smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID; - smem->bin.aN.yx [winner] = new_yx; - smem->bin.aN.id [winner] = new_id; - smem->bin.aN.count[winner] = 0; - - // - // remove all lanes matching this hash - // - is_collision = is_collision && !is_winner; - - // - // exit if nothing left to do - // - } while (sub_group_any(is_collision)); - -#endif -} - -// -// scatter scan max -// -static -SKC_RASTERIZE_UINT -skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem, - SKC_RASTERIZE_FLOAT const iss, - SKC_RASTERIZE_FLOAT const ess) -{ - // - // prefix sums determine which lanes we're going to work on next - // - SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP); - SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f)); - -#if 
( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
  //
  // SIMD APPROACH 1: SIMT'ISH
  //

  // zero the volatile smem scratchpad using vector syntax
  smem->subgroup.vN.scratch[0] = ( 0 );

  // scatter each component's index into its scratch slot
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                         \
  if (is_scratch_store C)                               \
    smem->subgroup.aN.scratch[scratch_idx C] = I;

  SKC_RASTERIZE_VECTOR_EXPAND();

  // propagate lanes to right using max scan
  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);

#else
  //
  // SIMD APPROACH 2: SCALAR'ISH
  //

  SKC_RASTERIZE_UINT source = ( 0 );

  // scatter each component's index into the register-resident vector
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                 \
  if (is_scratch_store C)                       \
    ((uint *)&source)[scratch_idx C] = I;

  SKC_RASTERIZE_VECTOR_EXPAND();

  // serial inclusive max-scan across the vector components
  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
#endif

#else
  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store)
    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();

  //
  // propagate lanes to right using max scan
  //
  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
#endif

  return source;
}

//
// sliver lines into subpixels
//
// Decomposes a subgroup's worth of line segments into 1-pixel-high
// slices ("y-slivering") and then into subpixel (vertical or
// 1-pixel-wide) slices ("x-slivering"), packing each subpixel line
// into a TTS word and appending it to the shared-memory tile bin
// selected by a hash of its tile coordinates.  A bin whose current
// tile key collides with a new key is ejected via skc_flush().
//
// NOTE(review): the bp_*/cohort_*/sk_* parameters are cursors into
// the block pool and TTSK key state owned by the calling rasterizer
// kernel -- this function only advances them through skc_flush().
//

static
void
skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
           __global union skc_bp_elem        * const bp_elems,
           __global uint                     * const bp_ids,
           skc_uint                            const bp_mask,
           __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
           skc_block_id_t                    * const subblocks,
           skc_block_id_v_t                  * const blocks,
           skc_uint                          * const blocks_next,
           skc_ttsk_v_t                      * const sk_v,
           skc_uint                          * const sk_v_next,
           __global skc_ttsk_s_t             * const sk_extent,
           __local struct skc_subgroup_smem volatile * const smem,
           SKC_RASTERIZE_FLOAT                 const l0x,
           SKC_RASTERIZE_FLOAT                 const l0y,
           SKC_RASTERIZE_FLOAT                 const l1x,
           SKC_RASTERIZE_FLOAT                 const l1y)
{
  //
  // Y-SLIVERING
  // -----------
  //
  // immediately sliver all multi-pixel lines into 1-pixel high
  // lines
  //
  // note this implicitly squelches horizontal lines
  //
  // there is another test for horizontal lines after x-slivering
  // is complete
  //

  //
  // will we need to flip the sign of y_delta ?
  //
  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);
  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;

  //
  // save 1/dy
  //
  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);

  //
  // how many non-horizontal subpixel y-axis slivers are there?
  //
  SKC_RASTERIZE_FLOAT const y_min  = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
  SKC_RASTERIZE_FLOAT const y_max  = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
  SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max;
  SKC_RASTERIZE_FLOAT       y_segs = y_max - y_min;

  //
  // inclusive subgroup scan of y_segs
  //
  SKC_RASTERIZE_FLOAT       y_iss  = skc_subgroup_scan_inclusive_add_float(y_segs);
  SKC_RASTERIZE_FLOAT       y_ess  = y_iss - y_segs;
  float                     y_rem  = skc_subgroup_last_float(y_iss);

  //
  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
  //
  if (y_segs == 0.0f)
    y_iss = 0.0f;

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
#endif

  //
  // these values don't matter on first iteration
  //
  SKC_RASTERIZE_FLOAT n1x_prev = 0;
  SKC_RASTERIZE_FLOAT n1y_prev = 0;

  //
  // loop until done
  //
  while (y_rem > 0.0f)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);

      //
      // get line at y_source line
      //
      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);

      //
      // every lane will create a 1 pixel tall line "sliver"
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if numerator == 1 then this is the first lane
      // if numerator == s then this is the last lane
      //
      SKC_RASTERIZE_FLOAT     const y_delta    = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
      SKC_RASTERIZE_FLOAT     const y_count    = skc_subgroup_shuffle(y_segs,y_source);

      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);

      // toggle y_delta sign
      SKC_RASTERIZE_FLOAT     const y_offset   = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));

      //
      // calculate "right" line segment endpoint
      //
      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));

      //
      // override c1 if this is last point
      //
      n1y = select(n1y,m1y,is_y_last);
      n1x = select(n1x,m1x,is_y_last);

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);

      //
      // override shuffle up if this is the first line segment
      //
      n0y = select(n0y,m0y,is_y_first);
      n0x = select(n0x,m0x,is_y_first);

      //
      // save previous right endpoint
      //
      n1x_prev = n1x;
      n1y_prev = n1y;

      //
      // decrement by subgroup size
      //
      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

#if 0
      //
      // debug
      //
      if (n0y != n1y) {
        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
      }
#endif

      //
      // X-SLIVERING
      // -----------
      //
      // now sliver 1-pixel high lines into either vertical or
      // 1-pixel wide lines
      //
      // save original direction and work with increasing x
      //
      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);
      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;

      //
      // save 1/dx
      //
      SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x);

      //
      // how many non-vertical subpixel x-axis slivers are there?
      //
      SKC_RASTERIZE_FLOAT const x_min  = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
      SKC_RASTERIZE_FLOAT const x_max  = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
      SKC_RASTERIZE_FLOAT const x_base = x_lt ? x_min : x_max;
      SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f);

      //
      // inclusive subgroup scan of x_segs
      //
      SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs);
      SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs;
      float               x_rem = skc_subgroup_last_float(x_iss);

      //
      // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails
      //
      //if (x_segs == 0.0f)
      //  x_iss = 0.0f;

      //
      // these values don't matter on first iteration
      //
      SKC_RASTERIZE_FLOAT p1x_prev = 0;
      SKC_RASTERIZE_FLOAT p1y_prev = 0;

      //
      // loop until done
      //
      while (x_rem > 0)
        {
          //
          // distribute work across lanes
          //
          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);

          //
          // get line at x_source line
          //
          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);

          //
          // every lane will create a 1 pixel wide line "sliver"
          //
          // FIXME -- this gets expanded on SIMD
          //
          // if numerator == 1 then this is the first lane
          // if numerator == s then this is the last lane
          //
          SKC_RASTERIZE_FLOAT     const x_delta    = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
          SKC_RASTERIZE_FLOAT     const x_count    = skc_subgroup_shuffle(x_segs,x_source);

          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);

          // toggle x_delta sign
          SKC_RASTERIZE_FLOAT     const x_offset   = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));

          //
          // calculate "right" line segment endpoint
          //
          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
          SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t));

          //
          // override c1 if this is last point
          //
          p1x = select(p1x,o1x,is_x_last);
          p1y = select(p1y,o1y,is_x_last);

          //
          // shuffle up "left" line segment endpoint
          //
          // NOTE: Intel's shuffle_up is unique with its elegant
          // "previous" argument so don't get used to it
          //
          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);

          //
          // override shuffle up if this is the first line segment
          //
          p0x = select(p0x,o0x,is_x_first);
          p0y = select(p0y,o0y,is_x_first);

          //
          // save previous right endpoint
          //
          p1x_prev = p1x;
          p1y_prev = p1y;

          //
          // decrement by subgroup size
          //
          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

          //
          // only non-horizontal subpixel lines are valid
          //
          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);

          //
          // if no lanes are active then continue
          //
          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
          // IMPACTS PERFORMANCE (+12% ?)
          //
          // IT SHOULDN'T !!!
          //
#if 0
          if (!skc_subgroup_any(is_active))
            continue;
#endif

          //
          // Option 1: use SLM for explicitly managed coalesced stores
          //
          // 1. which tile does this line belong?
          // 2. hash tile coordinates
          // 3. lookup hash
          // 4. if tile matches then SLM append keys
          // 5. if tile doesn't match
          //    a. flush
          //    b. create new TTSK_RYX
          //    c. obtain TTSB block from pool
          //    d. goto 3.
          //

          //
          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
          //
          // 1. which tile does this line belong?
          // 2. hash tile coordinates
          // 3. lookup hash
          // 4. if tile matches then GMEM append keys
          // 5. if tile doesn't match
          //    a. flush (and invalidate empty elems)
          //    b. create new TTSK_RYX
          //    c. obtain TTSB block from pool
          //    d. goto 3.
          //

          //
          // The virtual rasterization surface is very large and
          // signed: +/- ~64K-256K, depending on the architecture.
          //
          // Rasters must be clipped to the virtual surface and,
          // optionally, clipped even further on a per raster
          // basis.
          //

          //
          // Clip to the per-raster clip
          //

          /*

            CLIP HERE

          */

          //
          // Hash the tile coordinates
          //
          // This table lists nominal values for each architecture.
          // We want to choose values that naturally fit the
          // "width" of the architecture.
          //
          //   SIMD   RANGE    BITS  MAX RANGE  MAX BINS  HASH BITS
          //   ----  -------   ----  ---------  --------  ---------
          //     4   [0,  4]    3    [0,  7]      10       mod(10)  <-- SSE42, ?
          //     8   [0,  8]    4    [0, 15]       8       3        <-- GEN*,AVX*
          //    16   [0, 16]    5    [0, 31]       6       mod(6)   <-- GEN*,?
          //    32   [0, 32]    6    [0, 63]       5       mod(5)   <-- CUDA,PowerVR,Adreno,GEN*
          //    64   [0, 64]    7    [0,127]       4       2        <-- AMD Radeon
          //
          // NOTE: When possible, bias the hash toward using more y
          // bits because of:
          //
          //   1. the 90 degree counter-clockwise rotation that we put
          //      in place to offset the render-time clockwise
          //      rotation
          //
          //   2. the likely presence of left-to-right or
          //      right-to-left glyphs.
          //
          // For power-of-two bins, the hash is easy.
          //
          // For non-power-of-two, we may want to either implement a
          // fast mod (compiler should do this for us... hahahaha) or
          // drop down to the next power-of-two.
          //

          //
          // FIXME -- this snarl is not good -- can probably reduce
          // some of the sign casting but some is there to vectorize a
          // scalar
          //
          SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
          SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);

          SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
          SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);

          SKC_RASTERIZE_INT const min_y = min(z0y,z1y);
          SKC_RASTERIZE_INT const max_y = max(z0y,z1y);

          SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;

          SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
          SKC_RASTERIZE_INT        dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);

          //
          // map [+1,+32] to [ 0,+31]
          // map [-1,-32] to [-1,-32]
          //
          SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26;

          SKC_RASTERIZE_INT const min_x  = min(z0x,z1x);
          SKC_RASTERIZE_INT const max_x  = max(z0x,z1x);
          SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;

          SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
          SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);

          // pack the subpixel line into a TTS word
          SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx;

          // hash of the tile coordinates selects the SLM bin
          SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
                                           (SKC_AS(SKC_RASTERIZE_UINT)(tile_x)  & SKC_RASTERIZE_TILE_HASH_X_MASK));

          // packed tile coordinate key used to detect bin collisions
          SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));

#if 0
          printf("(%3u, %3u)\n",tile_y,tile_x);
#endif

#if 0
          if (is_active)
            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
#endif

          //
          // debug
          //
#if 0 // PRINTF_ENABLE

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
          if (is_active C)                                              \
            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);

          SKC_RASTERIZE_VECTOR_EXPAND();
#else
          if (is_active)
            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
#endif

#endif
          //
          // flush all active lanes
          //
          while (true)
            {
              //
              // either gather load or vector load+shuffle the yx keys
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              SKC_RASTERIZE_BIN  const yx_bin = smem->bin.vN.yx;
              SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash);
#else
              SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash];
#endif

              //
              // does yx for lane match yx for hash?
              //
              SKC_RASTERIZE_UINT      const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
              SKC_RASTERIZE_PREDICATE const is_match  = (yx_cur == active_yx);

              //
              // OpenCL spec: "When casting a bool to a vector integer
              // data type, the vector components will be set to -1
              // (i.e. all bits set) if the vector bool value is true
              // and 0 otherwise.
              //
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
              SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match;
#else
              SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0}
#endif
              //
              // how many new elements for each matching hash bin?
              //
              SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
              SKC_RASTERIZE_UINT const h     = h_match << h_shl;

              //
              // prefix sum all of the bins in parallel
              //
              SKC_RASTERIZE_UINT const h_iss   = skc_subgroup_scan_inclusive_add_uint(h);
              SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss);

              //
              // current bin counts
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              SKC_RASTERIZE_BIN  const count_bin = smem->bin.vN.count;
              SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash);
#else
              SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash];
#endif

              //
              // calculate where each cache-hit and in-bounds tts should be stored
              //
              SKC_RASTERIZE_UINT const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
              SKC_RASTERIZE_UINT const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;

              //
              // which lanes can append to a matching bin?
              //
              SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);

              //
              // scatter append tts elements to bin blocks
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
              //
              // SIMD
              //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
              if (is_append C)                                          \
                {                                                       \
                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C;     \
                  smem->bin.aN.count[hash C]               = count_new C; \
                }

              SKC_RASTERIZE_VECTOR_EXPAND();
#else
              //
              // SIMT
              //
              if (is_append)
                {
                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;
                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
                }
#endif
              //
              // try to keep predicate updates SIMD-friendly and
              // outside of predicated code paths -- this is not
              // always how we would normally do things on SIMT but
              // either approach is acceptable
              //

              //
              // mask off lanes/components that successfully appended
              //
              is_active = is_active && !is_append;

              //
              // are there any active lanes left?
//
              if (!skc_subgroup_any(is_active))
                break;

              //
              // There are active lanes that couldn't be appended to a
              // bin because their hashes collided with the bin's
              // current ryx key then those bins must be ejected.
              //
              // Note that we do not eject "full" bins because lazily
              // waiting for a collision results in simpler code.
              //
              skc_flush(bp_atomics,
                        bp_elems,
                        bp_ids,
                        bp_mask,
                        cohort_atomics,
                        subblocks,
                        blocks,
                        blocks_next,
                        sk_v,
                        sk_v_next,
                        sk_extent,
                        smem,
                        hash,
                        yx,
                        is_active);
            }
        }
    }
}

//
// INITIALIZE SMEM
//
// Resets every tile-hash bin's yx key and element count in shared
// memory before a rasterization pass begins.
//
// Note that SIMD/SIMT have nearly the same syntax.
//
static
void
skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
{
  //
  // initialize smem bins
  //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  smem->bin.vN.yx    = ( SKC_RASTERIZE_YX_INIT );
  smem->bin.vN.count = ( 0 );
#else
  //
  // SIMT
  //
  int idx = skc_subgroup_lane();

  // guard or loop depending on whether there are fewer or more bins
  // than subgroup lanes -- with equal counts neither is needed
#if   ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
  if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
  for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
#endif
    {
      smem->bin.aN.yx   [idx] = ( SKC_RASTERIZE_YX_INIT );
      smem->bin.aN.count[idx] = ( 0 );
    }
#endif
}

//
// RASTERIZE CUBIC KERNEL
//
// Loads a cubic Bezier control cage from the block pool, applies the
// affine transform, flattens the curve into line segments using
// Wang's Formula, and slivers each segment via skc_sliver().
//
// NOTE(review): the `cv` per-path clip parameter is currently unused
// here -- clipping is still a FIXME inside skc_sliver().
//
static
void
skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                     __global union skc_bp_elem        * const bp_elems,
                     __global uint                     * const bp_ids,
                     skc_uint                            const bp_mask,

                     __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                     __global skc_ttsk_s_t             * const sk_extent,

                     __local struct skc_subgroup_smem volatile * const smem,

                     skc_uint                          * const nodeword,
                     skc_block_id_t                    * const id,

                     union skc_transform const        * const tv,
                     union skc_path_clip const        * const cv,
                     skc_uint                           const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  //
  // apply transform
  //
  // note that we only care if the end points are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only support perspective later
  //
  // the affine transformation requires 8 FMA + 2 ROUND operations
  //
  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);

  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx  + c3y * tv->shx + tv->tx);
  SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy  + tv->ty);

  //
  // debug dump of the transformed control cage
  //
#if PRINTF_ENABLE

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                 \
  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"                \
         " { %.02f, %.02f }, { %.02f, %.02f } },\n",            \
         b0x C,b0y C,t1x C,t1y C,                               \
         t2x C,t2y C,t3x C,t3y C);

  SKC_RASTERIZE_VECTOR_EXPAND();

#else

  printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
         b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

#endif

#endif

  //
  // OLD APPROACH
  // ------------
  //
  // The Spinel CUDA rasterizer was significantly more complex and
  // performed a few different tasks that are probably best kept
  // separate.
  //
  // The Spinel rasterizer Bezier held 4-element x and y coordinates
  // in adjacent lanes. This simplified intermingling of single lane
  // 4-coordinate line segments with two-lane cubic Beziers.
  //
  // After transformation of the input segments, the Spinel rasterizer
  // would test cubics for flatness and, if flat, collapse the
  // adjacent lanes into a single line lane and an empty lane.
  //
  // Any lines would then be appended to a line queue.
  //
  // Any cubics would then be subdivided.
  //
  // The reclassification process would be repeated.
  //
  // NEW APPROACH
  // ------------
  //
  // Assume we're only working with cubics in this kernel.
  //
  // Optimization: if the line segment is a special case -- a cusp,
  // has 1+ inflections, or a loop -- it might be beneficial to
  // subdivide the control cage 1+ times in order to separate the
  // flatter segments from the high-velocity region(s).
  //
  // This means we want to split using [a,b] formulation to _directly_
  // subdivide producing a new control cage.
  //
  // Wang's Formula is still useful even if we subdivide once or twice
  // as it's so cheap that it might give some useful hints about where
  // the high-velocity sections of curve reside.
  //
  // But it seems like using Wang's and directly flattening to line
  // segments without any subdivision is good enough for the limited
  // set of test cases that I've tried.
  //
  // So... use Wang's Formula to estimate how many line segments are
  // required to properly flatten the cubics.
  //
  // Then use inclusive/exclusive scans to put all the lanes to work:
  //
  //   1. segmenting cubics to line segments
  //
  //   2. slivering line segments into 1-pixel high line segments
  //
  //   3. slivering 1-pixel high line segments into 1-pixel wide line
  //      segments
  //
  // MORE BACKGROUND ON NEW APPROACH
  // -------------------------------
  //
  // Two options for handling line segments:
  //
  //   1. append the line segments onto an SLM array until enough
  //      work has been accrued (Spinel does this)
  //
  //   2. immediately sliver the potentially multi-pixel line
  //      segments into subpixel lines
  //
  // The advantage of (1) is that it guarantees the slivering
  // process will, on average, always be emitting a full subgroup
  // of subpixel lines.
  //
  // The advantage of (2) is that it reduces code complexity and
  // leaves more room for SLM tile bins. The difference between Spinel
  // and Skia Compute is that Wang's Formula guarantees there will be
  // a full subgroup of multi-pixel lines unless this is the final
  // iteration of the warp of multi-pixel lines.
  //
  // Note that wider GPU architectures might benefit from (1) and
  // other work accumulation strategies because it will minimize
  // partial warp workloads in the final iteration of each stage. It
  // also minimizes the sunk cost of the uniform control logic steps.
  //
  // So let's implement (2) for now...
  //

  //
  // And... begin!
  //
  // Estimate how many line segments are in quad/cubic curve.
  //
  // Wang's Formula will return zero if the control points are
  // collinear but we bump it up to 1.0f.
//
  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

  //
  // if there are free registers then precalculate the reciprocal for
  // each estimated segments since it will never change
  //
  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);


  //
  // inclusive add scan of estimated line segments
  // exclusive add scan of estimated line segments
  // total number of estimated line segments
  //
  SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
  SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
  float               s_rem = skc_subgroup_last_float(s_iss); // scalar

  //
  // Precompute cubic polynomial coefficients from transformed control
  // cage so we can shuffle them in on each iteration of the outer
  // loop and then evaluate the polynomial in Horner form.
  //
  //                            |  1  0  0  0 |  | c0 |
  //                            |             |  |    |
  //                            | -3  3  0  0 |  | c1 |
  //   B(t) = [ 1 t^1 t^2 t^3 ] |             |  |    |
  //                            |  3 -6  3  0 |  | c2 |
  //                            |             |  |    |
  //                            | -1  3 -3  1 |  | c3 |
  //
  //
  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL
  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL

  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL
  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL

  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB

  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT l1x_prev = 0;
  SKC_RASTERIZE_FLOAT l1y_prev = 0;

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t subblocks = 0;
#endif

  //
  // loop until done
  //
  while (s_rem > 0)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

      //
      // every lane has a fraction to work off of
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if delta == 1 then this is the first lane
      // if count == s_segs then this is the last lane
      //
      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);

      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

      //
      // init parametric t
      //
      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

      //
      // if last then override to a hard 1.0f
      //
      s_t = is_s_last ? 1.0f : s_t;

      //
      // decrement by subgroup size
      //
      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

      //
      // now every lane knows what to do and the following lines will
      // pump out up to SUBGROUP_SIZE line segments
      //
      // obtain the src vertices through shared or via a shuffle
      //

      //
      // shuffle in the polynomial coefficients from their source lane
      //
      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);

      //
      // calculate "right" line segment endpoint using Horner form
      //
      SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
      SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
      SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

      //
      // save previous right endpoint
      //
      l1x_prev = l1x;
      l1y_prev = l1y;

      //
      // override shuffle up if this is the first line segment
      //
      l0x = select(l0x,s0x,is_s_first);
      l0y = select(l0y,s0y,is_s_first);

      //
      // sliver lines
      //
      skc_sliver(bp_atomics,
                 bp_elems,
                 bp_ids,
                 bp_mask,
                 cohort_atomics,
                 &subblocks,
                 &blocks,
                 &blocks_next,
                 &sk_v,
                 &sk_v_next,
sk_extent,
                 smem,
                 l0x,l0y,l1x,l1y);
    }

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}

//
// RASTERIZE QUAD KERNEL
//
// Quadratic analog of skc_rasterize_cubics(): loads a quadratic
// Bezier control cage from the block pool, applies the affine
// transform, flattens it into line segments via Wang's Formula and
// slivers each segment with skc_sliver().
//
static
void
skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                    __global union skc_bp_elem        * const bp_elems,
                    __global uint                     * const bp_ids,
                    skc_uint                            const bp_mask,

                    __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                    __global skc_ttsk_s_t             * const sk_extent,

                    __local struct skc_subgroup_smem volatile * const smem,

                    skc_uint                          * const nodeword,
                    skc_block_id_t                    * const id,

                    union skc_transform const        * const tv,
                    union skc_path_clip const        * const cv,
                    skc_uint                           const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  //
  // apply transform
  //
  // note that we only care if the end points are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only support perspective later
  //
  // the affine transformation requires 8 FMA + 2 ROUND operations
  //
  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);

  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx  + c2y * tv->shx + tv->tx);
  SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy  + tv->ty);

  //
  // Estimate how many line segments are in quad/cubic curve.
  //
  // Wang's Formula will return zero if the control points are
  // collinear but we bump it up to 1.0f.
  //
  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);

  //
  // if there are free registers then precalculate the reciprocal for
  // each estimated segments since it will never change
  //
  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);


  //
  // inclusive add scan of estimated line segments
  // exclusive add scan of estimated line segments
  // total number of estimated line segments
  //
  SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
  SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
  float               s_rem = skc_subgroup_last_float(s_iss); // scalar

  //
  // Precompute quadratic polynomial coefficients from control cage so
  // we can shuffle them in on each iteration of the outer loop and
  // then evaluate the polynomial in Horner form.
  //

  //                        |  1  0  0 |  | c0 |
  //                        |          |  |    |
  //   B(t) = [ 1 t^1 t^2 ] | -2  2  0 |  | c1 |
  //                        |          |  |    |
  //                        |  1 -2  1 |  | c2 |
  //
  //
  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL

  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD
  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD

  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT l1x_prev = 0;
  SKC_RASTERIZE_FLOAT l1y_prev = 0;

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t subblocks = 0;
#endif

  //
  // loop until done
  //
  while (s_rem > 0)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

      //
      // every lane has a fraction to work off of
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if delta == 1 then this is the first lane
      // if count == s_segs then this is the last lane
      //
      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);

      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

      //
      // init parametric t
      //
      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

      //
      // if last then override to a hard 1.0f
      //
      s_t = is_s_last ? 1.0f : s_t;

      //
      // decrement by subgroup size
      //
      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

      //
      // now every lane knows what to do and the following lines will
      // pump out up to SUBGROUP_SIZE line segments
      //
      // obtain the src vertices through shared or via a shuffle
      //

      //
      // shuffle in the polynomial coefficients from their source lane
      //
      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

      //
      // calculate "right" line segment endpoint using Horner form
      //
      SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
      SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
      SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

      //
      // save previous right endpoint
      //
      l1x_prev = l1x;
      l1y_prev = l1y;

      //
      // override shuffle up if this is the first line segment
      //
      l0x = select(l0x,s0x,is_s_first);
      l0y = select(l0y,s0y,is_s_first);

      //
      // sliver lines
      //
      skc_sliver(bp_atomics,
                 bp_elems,
                 bp_ids,
                 bp_mask,
                 cohort_atomics,
                 &subblocks,
                 &blocks,
                 &blocks_next,
                 &sk_v,
                 &sk_v_next,
                 sk_extent,
                 smem,
                 l0x,l0y,l1x,l1y);
    }

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
bp_mask, - cohort_atomics, - &blocks, - blocks_next, - &sk_v, - sk_v_next, - sk_extent, - smem); -} - -// -// RASTERIZE LINE KERNEL -// - -static -void -skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __local struct skc_subgroup_smem volatile * const smem, - - skc_uint * const nodeword, - skc_block_id_t * const id, - - union skc_transform const * const tv, - union skc_path_clip const * const cv, - skc_uint const cohort) -{ - // - // the initial segment idx and segments-per-block constant determine - // how many block ids will need to be loaded - // - SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - -#if 0 - // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y); - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y); -#endif - - // - // apply transform - // - // note that we only care if the end points are rounded to subpixel precision - // - // FIXME -- transformation is currently affine-only - // FIXME -- support perspective later - // - // the affine transformation requires 8 FMA + 4 ROUND operations - // - SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); - - SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx + c1y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const 
l1y = round(c1x * tv->shy + c1y * tv->sy + tv->ty); - -#if 0 - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y); -#endif - - // - // allocate and init in-register TTSK keys - // - skc_uint sk_v_next = 0; - skc_ttsk_v_t sk_v; - - sk_v.hi = cohort; - - // - // initialize smem - // - skc_smem_init(smem); - - // - // initialize blocks / subblocks - // - skc_block_id_v_t blocks; - skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t subblocks = 0; -#endif - - // - // sliver lines - // - skc_sliver(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &subblocks, - &blocks, - &blocks_next, - &sk_v, - &sk_v_next, - sk_extent, - smem, - l0x,l0y,l1x,l1y); - - // - // - flush work-in-progress blocks - // - return unused block ids - // - skc_finalize(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &blocks, - blocks_next, - &sk_v, - sk_v_next, - sk_extent, - smem); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which 
subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("+cmd_idx = %u\n",cmd_idx); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("-cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("[ %u ]< %u, %u, %u, %u >\n", - cmd_idx, - cmd.nodeword, - SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd), - SKC_CMD_RASTERIZE_GET_CLIP(cmd), - SKC_CMD_RASTERIZE_GET_COHORT(cmd)); -#endif - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id); - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. 
- // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. - // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - switch (tag) - { - case SKC_BLOCK_ID_TAG_PATH_LINE: - skc_rasterize_lines(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_QUAD: - skc_rasterize_quads(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_CUBIC: - skc_rasterize_cubics(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD: - break; - case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: - break; - - default: - break; - } -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( 
SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. 
- // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. - // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_lines(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? 
- // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. - // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. 
- // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_quads(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. 
- // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. - // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. 
- // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_cubics(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - ; -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - ; -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "raster_builder_cl_12.h"
+#include "device_cl_12.h"
+
+// #define SKC_ARCH_AVX2
+// #define SKC_RASTERIZE_SIMD_USES_SMEM
+
+#define PRINTF_ENABLE 0
+#define PRINTF_BLOCK_COUNT 0
+
+//
+// NOTE:
+//
+// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
+// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
+//
+// NOTE:
+//
+// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP.
+//
+//
+
+#if 0 // SKC_ARCH_AVX2
+
+// #define SKC_RASTERIZE_SUBGROUP_SIZE 1
+// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3
+// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1
+
+// #define SKC_TTXB_WORDS 8
+
+// #define SKC_RASTERIZE_FLOAT float8
+// #define SKC_RASTERIZE_UINT uint8
+// #define SKC_RASTERIZE_INT int8
+// #define SKC_RASTERIZE_PREDICATE int8
+
+// #define SKC_RASTERIZE_BIN_BLOCK uint16
+// #define SKC_RASTERIZE_BIN uint8
+
+// #define SKC_RASTERIZE_POOL uint8
+// #define SKC_RASTERIZE_POOL_SCALE 6
+
+// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1
+// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2
+
+// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8()
+
+#endif
+
+//
+// SIMT
+//
+
+#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE
+#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE
+#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1)
+
+//
+//
+//
+
+#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
+#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)
+
+//
+//
+//
+
+#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 }
+#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 }
+
+//
+//
+//
+
+#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
+#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
+#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)
+
+//
+// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
+//
+// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
+//
+// Lerp in two fma/mad ops:
+//
+// t * b + ((-t) * a + a)
+//
+// Note: OpenCL documents mix() as being implemented as:
+//
+// a + (b - a) * t
+//
+// But this may be a native instruction on some devices. For example,
+// on GEN9 there is an LRP "linear interpolation" opcode but it
+// doesn't appear to support half floats.
+//
+// Feel free to toggle this option and then benchmark and inspect the
+// generated code. We really want the double FMA to be generated when
+// there isn't support for a LERP/MIX operation.
+//
+
+#if 1
+#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a))
+#else
+#define SKC_LERP(a,b,t) mix(a,b,t)
+#endif
+
+//
+// There is no integer MAD in OpenCL with "don't care" overflow
+// semantics.
+//
+// FIXME -- verify if the platform needs explicit MAD operations even
+// if a "--fastmath" option is available at compile time. It might
+// make sense to explicitly use MAD calls if the platform requires it.
+//
+
+#if 1
+#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c))
+#else
+#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c)
+#endif
+
+//
+//
+//
+
+#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())
+
+//
+//
+//
+
+//
+// A block-pool element is one 32-bit word, reinterpreted according
+// to context:
+//
+union skc_bp_elem
+{
+ skc_uint u32; // raw word
+ skc_tagged_block_id_t tag_id; // tagged block id (read from head/node words)
+ skc_float coord; // path segment coordinate (read via SKC_RASTERIZE_SEGMENT)
+};
+
+//
+//
+//
+
+//
+// Per-subgroup shared-memory working set: a small scratch/reduction
+// area plus a cache of work-in-progress TTSB blocks indexed by a
+// tile-hash bin.
+//
+struct skc_subgroup_smem
+{
+ //
+ // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
+ //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
+ struct {
+ union {
+
+ skc_uint winner; // single-word rendezvous slot shared with the scan scratch
+
+ struct {
+ skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; // one scalar word per lane
+ } aN;
+
+ struct {
+ SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; // vector-typed view of the same storage
+ } vN;
+ };
+ } subgroup;
+#endif
+
+ //
+ // work-in-progress TTSB blocks and associated YX keys
+ //
+ union {
+ struct {
+ // FIXME -- some typedefs are valid here
+ skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; // per-bin TTSB subblock words
+ skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; // per-bin tile YX key
+ skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; // per-bin block id
+ skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; // per-bin occupancy -- presumably words written; verify against writers
+ } aN;
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+ struct {
+ SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; // SIMD (subgroup size 1) vectorized view
+ SKC_RASTERIZE_BIN yx;
+ SKC_RASTERIZE_BIN id;
+ SKC_RASTERIZE_BIN count;
+ } vN;
+#endif
+ } bin;
+};
+
+//
+//
+//
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+#define skc_subgroup_lane() 0
+#else
+#define skc_subgroup_lane() get_sub_group_local_id()
+#endif
+
+//
+// replenish block ids
+//
+// note that you can't overrun the block id pool since it's a ring
+//
+
+//
+// Refill the subgroup's in-register vector of block ids from the
+// global block pool ring:
+//
+// - lane 0 atomically advances the pool's read counter by
+// SKC_RASTERIZE_BLOCK_ID_V_SIZE
+// - the base index is broadcast, offset by lane, wrapped with the
+// pow2 ring mask, and each lane loads one id
+// - the consumption cursor (*blocks_next) is reset to 0
+//
+static
+void
+skc_blocks_replenish(skc_uint * const blocks_next,
+ skc_block_id_v_t * const blocks,
+ __global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const bp_ids)
+{
+ //
+ // get a new vector of block ids -- this is kind of a narrow
+ // allocation but subblocks help stretch out the pool.
+ //
+ // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
+ //
+ skc_uint bp_idx = 0;
+
+ // only one lane performs the atomic reservation
+ if (skc_subgroup_lane() == 0)
+ {
+ bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
+ SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
+#if 0
+ printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
+#endif
+ }
+
+ // broadcast lane 0's base index, fan out by lane, wrap on the pow2 ring
+ bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
+ *blocks = bp_ids[bp_idx];
+ *blocks_next = 0;
+}
+
+//
+//
+//
+
+//
+// Return the next block id from the subgroup's in-register vector,
+// replenishing the vector from the block pool ring once all
+// SKC_RASTERIZE_BLOCK_ID_V_SIZE ids have been consumed.
+//
+static
+skc_block_id_t
+skc_blocks_get_next(skc_uint * const blocks_next,
+ skc_block_id_v_t * const blocks,
+ __global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const bp_ids)
+{
+ // replenish?
+ if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
+ {
+ skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
+ }
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
+ //
+ // SIMT
+ //
+ // read the id held by lane (*blocks_next)
+ skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
+
+#else
+ //
+ // SIMD
+ //
+ // pop component 0 and shift the rest of the vector down by one
+ skc_block_id_t id = blocks->s0;
+
+ skc_shuffle_down_1(*blocks);
+
+#endif
+
+ *blocks_next += 1;
+
+ return id;
+}
+
+//
+// subblock allocator
+//
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+
+//
+// Return the next subblock id. Subblock ids are carved sequentially
+// out of a block: whenever the running counter is aligned on a block
+// boundary (low SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK bits are zero) a
+// fresh block id is fetched from the pool and slicing restarts.
+//
+static
+skc_block_id_t
+skc_subblocks_get_next(skc_block_id_t * const subblocks,
+ skc_uint * const blocks_next,
+ skc_block_id_v_t * const blocks,
+ __global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const bp_ids)
+{
+ // crossed into a new block? grab one
+ if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+ {
+ *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
+ }
+
+ skc_block_id_t const sb_id = *subblocks;
+
+ *subblocks += 1;
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("= %u\n",sb_id);
+#endif
+
+ return sb_id;
+}
+
+
+#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks
+#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks
+
+#else
+
+#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks
+#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks
+
+#endif
+
+//
+//
+//
+
+//
+// Allocate a new (sub)block id and append a TTSK key for it to the
+// subgroup's in-register key vector:
+//
+// - the key { lo = new_id, hi = cohort-bits | new_yx } is written by
+// the single lane whose index equals (*sk_v_next & MASK)
+// - when SKC_RASTERIZE_TTSK_V_SIZE keys have accumulated, lane 0
+// atomically reserves a span in the cohort's key extent and the
+// whole vector is flushed to sk_extent[]; the counter then resets
+//
+// Returns the freshly allocated block/subblock id.
+//
+static
+skc_block_id_t
+skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
+ skc_uint * const blocks_next,
+ __global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const bp_ids,
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ skc_ttsk_v_t * const sk_v,
+ skc_uint * const sk_v_next,
+ __global skc_ttsk_s_t * const sk_extent,
+ skc_uint const new_yx)
+{
+ // allocate: subblock granularity when blocks are subdivided,
+ // otherwise whole blocks
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+ skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
+ blocks_next,
+ blocks,
+ bp_atomics,
+ bp_mask,
+ bp_ids);
+#else
+ skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
+ blocks,
+ bp_atomics,
+ bp_mask, // pow2 modulo mask for block pool ring
+ bp_ids);
+#endif
+
+ // exactly one lane records this key, preserving the cohort bits in .hi
+ if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
+ {
+ sk_v->lo = new_id;
+ sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
+#if 0
+ printf("@ ( %3u, %3u ) %u\n",
+ (new_yx >> 12) & 0xFFF,
+ (new_yx ) & 0xFFF,
+ new_id);
+#endif
+ }
+
+ *sk_v_next += 1;
+
+ // vector full? flush it to the global key extent
+ if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
+ {
+ *sk_v_next = 0;
+
+ skc_uint sk_idx = 0;
+
+ // lane 0 reserves a contiguous span of key slots
+ if (skc_subgroup_lane() == 0)
+ {
+ sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+ (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
+#if 0
+ printf("+ %u\n",sk_idx);
+#endif
+ }
+
+ sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
+
+ // if the subgroup is wider than the key vector, only the low lanes store
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
+ if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
+#endif
+ {
+ sk_extent[sk_idx] = *sk_v;
+#if 0
+ printf("> %u : %v2u\n",sk_idx,*sk_v);
+#endif
+ }
+ }
+
+ return new_id;
+}
+
+//
+//
+//
+
+//
+// Inclusive prefix sum (+) of a float vector across the subgroup.
+//
+// SIMD (subgroup size 1): a log-step, Hillis/Steele-style scan built
+// from rotated vectors and mads -- the wrapped-around element in each
+// step is masked off by the leading 0s in the 0/1 constant vector.
+// SIMT: maps directly onto sub_group_scan_inclusive_add().
+//
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+ //
+ // SIMD
+ //
+ // Note that there isn't a built-in horizontal scan for vectors so
+ // we'll define some here for various widths.
+ //
+ // FIXME -- a scalar version might be faster so put in a
+ // compile-time switch to select between implementations
+ //
+
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+ // width 1: already the scan
+ return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+ // 01
+ // 0 +
+ // --
+ // 01
+ SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
+ return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+ // 0123
+ // 012 +
+ // ----
+ // 0123
+ // 01 +
+ // ----
+ // 0123
+ //
+ SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
+ SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
+ return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+ // 01234567
+ // 0123456 +
+ // --------
+ // 01234567
+ // 012345 +
+ // --------
+ // 01234567
+ // 0123 +
+ // --------
+ // 01234567
+ //
+ SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
+ SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
+ SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
+ return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+ // 0123456789abcdef
+ // 0123456789abcde +
+ // ----------------
+ // 0123456789abcdef
+ // 0123456789abcd +
+ // ----------------
+ // 0123456789abcdef
+ // 0123456789ab +
+ // ----------------
+ // 0123456789abcdef
+ // 01234567 +
+ // ----------------
+ // 0123456789abcdef
+ //
+ SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
+ SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
+ SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
+ SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
+ return z;
+
+#endif
+
+#else
+ //
+ // SIMT
+ //
+
+ return sub_group_scan_inclusive_add(v);
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Subgroup-wide inclusive prefix sum ("+" scan) of a uint value.
+//
+// SIMD build (SKC_RASTERIZE_SUBGROUP_SIZE == 1): the "subgroup" is a
+// single N-wide vector, so the scan is synthesized from log2(N)
+// rotate-and-accumulate (SKC_MAD_UINT) steps. The ASCII diagrams show
+// which source lanes contribute at each step; the 0/1 multiplicand
+// vector masks off lanes that must not accumulate.
+//
+// SIMT build: defers to the OpenCL 2.0 subgroup scan built-in.
+//
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // Note that there isn't a built-in horizontal scan for vectors so
+  // we'll define some here for various widths.
+  //
+  // FIXME -- a scalar version might be faster so put in a
+  // compile-time switch to selection between implementations
+  //
+
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  // scalar: the scan of a single element is itself
+  return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  // 01
+  // 0  +
+  // --
+  // 01
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
+  return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  // 0123
+  // 012  +
+  // ----
+  // 0123
+  // 01   +
+  // ----
+  // 0123
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
+  return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  // 01234567
+  // 0123456  +
+  // --------
+  // 01234567
+  // 012345   +
+  // --------
+  // 01234567
+  // 0123     +
+  // --------
+  // 01234567
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
+  return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  // 0123456789abcdef
+  // 0123456789abcde  +
+  // ----------------
+  // 0123456789abcdef
+  // 0123456789abcd   +
+  // ----------------
+  // 0123456789abcdef
+  // 0123456789ab     +
+  // ----------------
+  // 0123456789abcdef
+  // 01234567         +
+  // ----------------
+  // 0123456789abcdef
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
+  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
+  return z;
+
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  return sub_group_scan_inclusive_add(v);
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Subgroup-wide inclusive max scan of a uint value: lane i receives
+// max(v[0..i]).
+//
+// SIMD build: synthesized from log2(N) shuffle+max steps. Unlike the
+// add scans above no masking vector is needed -- re-reading lane 0
+// (its own value) is a harmless identity under max.
+//
+// SIMT build: defers to the OpenCL 2.0 subgroup scan built-in.
+//
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // Note that there isn't a built-in horizontal scan for vectors so
+  // we'll define some here for various widths.
+  //
+  // FIXME -- a scalar version might be faster so put in a
+  // compile-time switch to selection between implementations
+  //
+
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  // scalar: the scan of a single element is itself
+  return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  // 01
+  // 00 max
+  // --
+  // 01
+  SKC_RASTERIZE_UINT const w = max(v.s00,v);
+  return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  // 0123
+  // 0012 max
+  // ----
+  // 0123
+  // 0101 max
+  // ----
+  // 0123
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s0012,v);
+  SKC_RASTERIZE_UINT const x = max(w.s0101,w);
+  return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  // 01234567
+  // 00123456 max
+  // --------
+  // 01234567
+  // 01012345 max
+  // --------
+  // 01234567
+  // 01230123 max
+  // --------
+  // 01234567
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
+  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
+  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
+  return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  // 0123456789abcdef
+  // 00123456789abcde max
+  // ----------------
+  // 0123456789abcdef
+  // 010123456789abcd max
+  // ----------------
+  // 0123456789abcdef
+  // 01230123456789ab max
+  // ----------------
+  // 0123456789abcdef
+  // 0123456701234567 max
+  // ----------------
+  // 0123456789abcdef
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
+  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
+  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
+  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
+  return z;
+
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  return sub_group_scan_inclusive_max(v);
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Broadcast the value held by the LAST lane of the subgroup to all
+// lanes, returned as a scalar float.
+//
+// SIMD build: simply extract the highest vector component.
+// SIMT build: broadcast from lane (subgroup size - 1).
+//
+static
+float
+skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return v.s1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return v.s3;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return v.s7;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return v.sf;
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Broadcast the value held by the LAST lane of the subgroup to all
+// lanes -- uint counterpart of skc_subgroup_last_float().
+//
+// NOTE(review): the return type is the full SKC_RASTERIZE_UINT vector
+// rather than a scalar uint as in the float variant -- in the SIMD
+// build the scalar component is implicitly splatted on return.
+//
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return v.s1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return v.s3;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return v.s7;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return v.sf;
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Broadcast the value held by the FIRST lane of the subgroup to all
+// lanes, returned as a scalar float.
+//
+static
+float
+skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD -- component 0 of the vector
+  //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#else
+  return v.s0;
+#endif
+
+#else
+  //
+  // SIMT -- broadcast from lane 0
+  //
+  return sub_group_broadcast(v,0);
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Gather: each lane reads the value held by lane i of v.
+//
+// SIMD build: OpenCL's vector shuffle() with i as the index vector.
+// SIMT build: uses the cl_intel_subgroups extension --
+// intel_sub_group_shuffle() -- so this path is Intel-specific; other
+// vendors would need sub_group_shuffle() from cl_khr_subgroup_shuffle
+// or an equivalent.
+//
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
+                     SKC_RASTERIZE_UINT  const i)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#else
+  return shuffle(v,i);
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return intel_sub_group_shuffle(v,i);
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Shift values up by one lane: lane i receives c[i-1], and lane 0
+// receives the LAST lane of p (the "previous" subgroup-width chunk).
+//
+// SIMD build: shuffle2() concatenates p and c (indices 0..N-1 address
+// p, N..2N-1 address c) and selects a window starting at p's last
+// component.
+//
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
+                          SKC_RASTERIZE_FLOAT const c) // current
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // FIXME -- there are alternative formulations here:
+  //
+  // Option 1:
+  //
+  //   select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
+  //
+  // Option 2:
+  //
+  //   p is a scalar
+  //   t = c.rotate(+1)
+  //   t.s0 = p;
+  //
+  // Option 3: ...
+  //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return p;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return shuffle2(p,c,(uint2)(1,2));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return shuffle2(p,c,(uint4)(3,4,5,6));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  // NOTE: Intel-specific -- intel_sub_group_shuffle_up's explicit
+  // "previous" argument has no portable OpenCL equivalent.
+  //
+  return intel_sub_group_shuffle_up(p,c,1);
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Is this lane the first (lowest-indexed) lane of the subgroup?
+//
+// Declared with a (void) prototype -- an empty parameter list in
+// C99-based OpenCL C means "unspecified parameters", not "none".
+//
+static
+bool
+skc_is_lane_first(void)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD -- the single "lane" is trivially first
+  //
+  return true;
+#else
+  //
+  // SIMT -- only subgroup lane 0 qualifies
+  //
+  return get_sub_group_local_id() == 0;
+#endif
+}
+
+//
+//
+//
+
+//
+// Return the 1-based lane ordinal as a float: lane i holds (i + 1).
+//
+// Used as the running numerator when distributing curve segmentation
+// across the subgroup: 1.0 marks the first lane and the subgroup
+// width marks the last.
+//
+// Declared with a (void) prototype -- an empty parameter list in
+// C99-based OpenCL C means "unspecified parameters", not "none".
+//
+static
+SKC_RASTERIZE_FLOAT
+skc_delta_offset(void)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD -- compile-time constant vector of ordinals
+  //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return 1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
+#endif
+
+#else
+  //
+  // SIMT -- lane id is only known at run time
+  //
+  return 1.0f + get_sub_group_local_id();
+
+#endif
+}
+
+//
+//
+//
+
+//
+// Subgroup vote: non-zero if any lane's predicate is set.
+//
+// SIMD build: OpenCL relational any() -- true if the most significant
+// bit of any component of p is set.
+// SIMT build: the OpenCL 2.0 subgroup vote built-in.
+//
+static
+int
+skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  return any(p);
+#else
+  //
+  // SIMT
+  //
+  return sub_group_any(p);
+#endif
+}
+
+//
+//
+//
+
+#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)
+
+//
+// Advance iteration to the next subblock of a path segment.
+//
+// Increments *id; when the incremented id crosses a block boundary
+// the node word is advanced too. If the advanced word is the last
+// word of the device block, it holds a tagged link to the next node
+// block which is dereferenced first. Finally the subblock id is
+// reloaded from the tagged block id at *nodeword.
+//
+// Marked static for consistency with every other helper in this file.
+//
+static
+void
+skc_segment_next(__global union skc_bp_elem * const bp_elems,
+                 skc_uint                   * const nodeword,
+                 skc_block_id_t             * const id)
+{
+  // only fetch a new block id once the current block is exhausted
+  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+    {
+      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
+        {
+          // follow the link stored in the node's last word
+          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
+        }
+
+      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;
+
+      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+    }
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
+{
+  // componentwise Euclidean norm via the fast "native" square root
+  SKC_RASTERIZE_FLOAT const d2 = x * x + y * y;
+
+  return native_sqrt(d2);
+}
+
+//
+// Wang's Formula (1985)
+//
+
+#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned
+
+#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)
+
+#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
+#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON))
+
+#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y)
+#define SKC_WANG_SQRT(x) native_sqrt(x)
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
+                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
+                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
+                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
+{
+  //
+  // Returns the count of parametrically evenly spaced line segments
+  // guaranteed to stay within "epsilon" of the cubic.
+  //
+  // The caller takes multiples of the reciprocal of this count so the
+  // segmentation can be spread across the subgroup.
+  //
+  // Probably slightly tunable per architecture but unlikely to be a
+  // hotspot -- it's straight-line unpredicated code.
+  //
+  // The result is an integral float in [1.0,#segments]. Even fully
+  // coincident control points produce 1.0, which is exactly what we
+  // want: degenerate cubics become single lines and are culled later
+  // (when horizontal or zero-length) like any other line.
+  //
+  // Wang's bound is driven by the largest absolute second difference
+  // of the control points along each axis:
+  //
+  SKC_RASTERIZE_FLOAT const ddx = max(fabs(t2x - 2.0f * t1x + t0x),
+                                      fabs(t3x - 2.0f * t2x + t1x));
+
+  SKC_RASTERIZE_FLOAT const ddy = max(fabs(t2y - 2.0f * t1y + t0y),
+                                      fabs(t3y - 2.0f * t2y + t1y));
+
+  return max(1.0f,ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC * SKC_WANG_LENGTH(ddx,ddy))));
+}
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
+                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
+                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
+{
+  //
+  // Quadratic flavor of Wang's formula -- see the cubic variant above
+  // for the full rationale. A quadratic has a single second
+  // difference per axis, so no max() over spans is required.
+  //
+  SKC_RASTERIZE_FLOAT const ddx = fabs(t2x - 2.0f * t1x + t0x);
+  SKC_RASTERIZE_FLOAT const ddy = fabs(t2y - 2.0f * t1y + t0y);
+
+  return max(1.0f,ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * SKC_WANG_LENGTH(ddx,ddy))));
+}
+
+//
+// rational curves
+//
+
+//
+// FIXME -- unimplemented placeholder: rational cubic segmentation is
+// not yet supported and currently reports zero segments. Declared
+// with a (void) prototype since an empty parameter list in C99-based
+// OpenCL C means "unspecified parameters".
+//
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_cubic_rat(void)
+{
+  return 0.0f;
+}
+
+//
+// FIXME -- unimplemented placeholder: rational quadratic segmentation
+// is not yet supported and currently reports zero segments. Declared
+// with a (void) prototype since an empty parameter list in C99-based
+// OpenCL C means "unspecified parameters".
+//
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_quad_rat(void)
+{
+  return 0.0f;
+}
+
+//
+// flush any work-in-progress blocks and return unused block ids
+//
+
+//
+// End-of-rasterization cleanup for a subgroup:
+//
+//   1. write every non-empty TTSB hash bin back to the block pool
+//   2. return the subgroup's unused block ids to the pool ring
+//   3. append the remaining work-in-progress TTSK keys to sk_extent
+//
+// NOTE(review): uses sub_group_broadcast() directly rather than a
+// skc_subgroup_* wrapper, so this helper presumably only runs on the
+// SIMT build -- confirm against the SIMD configuration.
+//
+static
+void
+skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+             __global union skc_bp_elem        * const bp_elems,
+             __global uint                     * const bp_ids,
+             skc_uint                            const bp_mask,
+             __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+             skc_block_id_v_t                  * const blocks,
+             skc_uint                            const blocks_next,
+             skc_ttsk_v_t                      * const sk_v,
+             skc_uint                            const sk_v_next,
+             __global skc_ttsk_s_t             * const sk_extent,
+             __local struct skc_subgroup_smem volatile * const smem)
+{
+  //
+  // flush non-empty bins
+  //
+  // FIXME -- accelerate this iteration/search with a subgroup operation
+  //
+  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
+    {
+      if (smem->bin.aN.count[ii] > 0)
+        {
+          // each lane stores its word of the bin's TTSB block
+          skc_block_id_v_t const id  = smem->bin.aN.id[ii];
+          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
+#if 0
+          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
+#endif
+          bp_elems[idx].u32 = tts;
+        }
+
+      //
+      // FIXME -- vectorize with vstoreN()
+      //
+    }
+
+  //
+  // return remaining block ids back to the pool
+  //
+  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;
+
+  if (blocks_rem > 0)
+    {
+      skc_uint bp_idx = 0;
+
+      // lane 0 reserves ring slots for the whole subgroup
+      if (skc_subgroup_lane() == 0)
+        {
+          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);
+
+#if 0
+          printf("r-: %8u + %u\n",bp_idx,blocks_rem);
+#endif
+        }
+
+      // each lane computes its own pow2-wrapped ring slot
+      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;
+
+      // only the lanes holding unused ids write them back
+      if (skc_subgroup_lane() >= blocks_next)
+        {
+          bp_ids[bp_idx] = *blocks;
+        }
+    }
+
+  //
+  // flush work-in-progress ryx keys
+  //
+  if (sk_v_next > 0)
+    {
+      skc_uint sk_idx = 0;
+
+      // lane 0 reserves key extent slots for the whole subgroup
+      if (skc_subgroup_lane() == 0)
+        {
+          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
+#if 0
+          printf("* %u\n",sk_idx);
+#endif
+        }
+
+      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
+
+      // only the lanes holding pending keys store them
+      if (skc_subgroup_lane() < sk_v_next)
+        {
+          sk_extent[sk_idx] = *sk_v;
+        }
+    }
+}
+
+//
+// If there are lanes that were unable to append to a bin because
+// their hashes collided with a bin's current ryx key then those bins
+// must be ejected.
+//
+// Note that we do not eject "full" bins because lazily waiting for a
+// collision results in simpler code.
+//
+
+//
+// Eject hash bins whose current ryx key collided with a lane's key,
+// flush the ejected TTSB blocks to the block pool and rebind each bin
+// to the winning lane's yx with a freshly appended TTSK/TTSB id.
+//
+static
+void
+skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+          __global union skc_bp_elem        * const bp_elems,
+          __global uint                     * const bp_ids,
+          skc_uint                            const bp_mask,
+          __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+          skc_block_id_t                    * const subblocks,
+          skc_block_id_v_t                  * const blocks,
+          skc_uint                          * const blocks_next,
+          skc_ttsk_v_t                      * const sk_v,
+          skc_uint                          * const sk_v_next,
+          __global skc_ttsk_s_t             * const sk_extent,
+          __local struct skc_subgroup_smem volatile * const smem,
+          SKC_RASTERIZE_UINT                  const hash,
+          SKC_RASTERIZE_UINT                  const yx,
+          SKC_RASTERIZE_PREDICATE             is_collision) // pass by value
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+
+  //
+  // FIXME -- this code is now stale with the changes to the
+  // subblock/block allocation strategy
+  //
+  // NOTE(review): this SIMD path references `ttsk_ryx` and `cmd`
+  // below, neither of which is a parameter or local of this function,
+  // so it cannot compile if this configuration is enabled -- stale,
+  // consistent with the FIXME above.
+  //
+
+  //
+  // get local TTSB ID queue count
+  //
+  skc_uint ttsb_id_count = smem->pool.count; // scalar
+
+  // init hash bit mask
+  skc_uint component_mask = 0;
+
+  // process each vector component ("lane") in turn
+  for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
+    {
+      // if no collision continue
+      if (((int*)&is_collision)[cc] == 0)
+        continue;
+
+      uint const winner        = ((uint*)&hash)[cc];
+      uint const component_bit = 1u << winner;
+
+      // if already processed this hash then continue
+      if (component_mask & component_bit)
+        continue;
+
+      // update component mask
+      component_mask |= component_bit;
+
+      //
+      // new winner requires ejecting the old TTSB
+      //
+      if (smem->bin.aN.count[winner] > 0)
+        {
+          skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+
+          bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
+        }
+
+      //
+      // ensure there is at least one TTSK and TTSB ID
+      //
+      if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
+        {
+          //
+          // update remaining count
+          //
+          ttsb_id_count = 0;
+
+          //
+          // flush accumulated ttsk_ryx keys
+          //
+          uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count
+
+#if 0
+          printf("# %u\n",idx);
+#endif
+
+          for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
+            {
+              ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);
+            }
+
+          //
+          // allocate more ttsb ids from pool
+          //
+          uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads
+
+          for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
+            smem->pool.aN.id[ii] = bp_ids[id + ii];
+        }
+
+      //
+      // invalidate the winning block
+      //
+
+      //
+      // update bin with winning yx, new ttsb id and zero count
+      //
+      // all lanes are loading/storing from/to the same index
+      //
+      smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
+      smem->bin.aN.id   [winner] = smem->pool.aN.id[ttsb_id_count];
+      smem->bin.aN.yx   [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
+      smem->bin.aN.count[winner] = 0;
+
+      //
+      // update count
+      //
+      ttsb_id_count += 1;
+    }
+
+  //
+  // save count
+  //
+  smem->pool.count = ttsb_id_count;
+
+#else
+  //
+  // SIMT
+  //
+  // Each loop iteration elects one colliding bin, ejects it and
+  // rebinds it; lanes whose hash matched the elected bin drop out.
+  //
+  do {
+    //
+    // only one lane will win!
+    //
+    if (is_collision)
+      smem->subgroup.winner = hash;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //
+    // which bin is being ejected?
+    //
+    skc_uint const winner = smem->subgroup.winner;
+
+    //
+    // which colliding hash is taking over the bin?
+    //
+    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);
+
+    //
+    // all lanes with the same hash will try to store but only one
+    // lane will win
+    //
+    if (is_winner)
+      smem->subgroup.winner = yx;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //
+    // flush this block to the pool
+    //
+    if (smem->bin.aN.count[winner] > 0)
+      {
+        skc_block_id_v_t const id  = smem->bin.aN.id[winner];
+        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
+#if 0
+        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
+#endif
+        bp_elems[idx].u32 = tts;
+      }
+
+    //
+    // append new ttsk
+    //
+    skc_uint       const new_yx = smem->subgroup.winner;
+    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
+                                                    blocks_next,
+                                                    bp_atomics,
+                                                    bp_mask, // pow2 modulo mask for block pool ring
+                                                    bp_ids,
+                                                    cohort_atomics,
+                                                    sk_v,
+                                                    sk_v_next,
+                                                    sk_extent,
+                                                    new_yx);
+
+#if 0
+    if (get_sub_group_local_id() == 0) {
+      printf(">>> %9u\n",new_id);
+    }
+#endif
+
+    //
+    // update bin with winning yx, new ttsb id and zero count
+    //
+    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
+    smem->bin.aN.yx   [winner]                      = new_yx;
+    smem->bin.aN.id   [winner]                      = new_id;
+    smem->bin.aN.count[winner]                      = 0;
+
+    //
+    // remove all lanes matching this hash
+    //
+    is_collision = is_collision && !is_winner;
+
+    //
+    // exit if nothing left to do
+    //
+  } while (sub_group_any(is_collision));
+
+#endif
+}
+
+//
+// scatter scan max
+//
+//
+// Compute, for each lane, the index of the source lane whose segment
+// quota it should work on next ("scatter scan max").
+//
+// Each source lane with remaining work (iss > 0) scatters its own
+// index into a scratchpad at its exclusive-scan offset; an inclusive
+// max scan then propagates those indices rightward so every lane
+// learns which source segment it owns.
+//
+static
+SKC_RASTERIZE_UINT
+skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
+                     SKC_RASTERIZE_FLOAT                         const iss,
+                     SKC_RASTERIZE_FLOAT                         const ess)
+{
+  //
+  // prefix sums determine which lanes we're going to work on next
+  //
+  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
+  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
+  //
+  // SIMD APPROACH 1: SIMT'ISH
+  //
+
+  // zero the volatile smem scratchpad using vector syntax
+  smem->subgroup.vN.scratch[0] = ( 0 );
+
+  // scatter each active component's index into the scratchpad
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                 \
+  if (is_scratch_store C)                       \
+    smem->subgroup.aN.scratch[scratch_idx C] = I;
+
+  SKC_RASTERIZE_VECTOR_EXPAND();
+
+  // propagate lanes to right using max scan
+  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
+  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
+
+#else
+  //
+  // SIMD APPROACH 2: SCALAR'ISH
+  //
+
+  SKC_RASTERIZE_UINT source = ( 0 );
+
+  // scatter directly into the vector via component-wise stores
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                 \
+  if (is_scratch_store C)                       \
+    ((uint *)&source)[scratch_idx C] = I;
+
+  SKC_RASTERIZE_VECTOR_EXPAND();
+
+  // serial max scan across the vector components
+  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
+    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  //
+  // zero the volatile smem scratchpad using vector syntax
+  //
+  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );
+
+  //
+  // store source lane at starting lane
+  //
+  if (is_scratch_store)
+    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();
+
+  //
+  // propagate lanes to right using max scan
+  //
+  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
+  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
+#endif
+
+  return source;
+}
+
+//
+// sliver lines into subpixels
+//
+
+static
+void
+skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ skc_block_id_t * const subblocks,
+ skc_block_id_v_t * const blocks,
+ skc_uint * const blocks_next,
+ skc_ttsk_v_t * const sk_v,
+ skc_uint * const sk_v_next,
+ __global skc_ttsk_s_t * const sk_extent,
+ __local struct skc_subgroup_smem volatile * const smem,
+ SKC_RASTERIZE_FLOAT const l0x,
+ SKC_RASTERIZE_FLOAT const l0y,
+ SKC_RASTERIZE_FLOAT const l1x,
+ SKC_RASTERIZE_FLOAT const l1y)
+{
+ //
+ // Y-SLIVERING
+ // -----------
+ //
+ // immediately sliver all multi-pixel lines in into 1-pixel high
+ // lines
+ //
+ // note this implicitly squelches horizontal lines
+ //
+ // there is another test for horizontal lines after x-slivering
+ // is complete
+ //
+
+ //
+ // will we need to flip the sign of y_delta ?
+ //
+ SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y);
+ SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000;
+
+ //
+ // save 1/dy
+ //
+ SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);
+
+ //
+ // how many non-horizontal subpixel y-axis slivers are there?
+ //
+ SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
+ SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
+ SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max;
+ SKC_RASTERIZE_FLOAT y_segs = y_max - y_min;
+
+ //
+ // inclusive subgroup scan of y_segs
+ //
+ SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs);
+ SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs;
+ float y_rem = skc_subgroup_last_float(y_iss);
+
+ //
+ // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
+ //
+ if (y_segs == 0.0f)
+ y_iss = 0.0f;
+
+#if 0
+ printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
+#endif
+
+ //
+ // these values don't matter on first iteration
+ //
+ SKC_RASTERIZE_FLOAT n1x_prev = 0;
+ SKC_RASTERIZE_FLOAT n1y_prev = 0;
+
+ //
+ // loop until done
+ //
+ while (y_rem > 0.0f)
+ {
+ //
+ // distribute work across lanes
+ //
+ SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);
+
+ //
+ // get line at y_source line
+ //
+ SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
+ SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
+ SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
+ SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);
+
+ //
+ // every lane will create a 1 pixel tall line "sliver"
+ //
+ // FIXME -- this gets expanded on SIMD
+ //
+ // if numerator == 1 then this is the first lane
+ // if numerator == s then this is the last lane
+ //
+ SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
+ SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source);
+
+ SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
+ SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count);
+
+ // toggle y_delta sign
+ SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));
+
+ //
+ // calculate "right" line segment endpoint
+ //
+ SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
+ SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
+ SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t));
+
+ //
+ // override c1 if this is last point
+ //
+ n1y = select(n1y,m1y,is_y_last);
+ n1x = select(n1x,m1x,is_y_last);
+
+ //
+ // shuffle up "left" line segment endpoint
+ //
+ // NOTE: Intel's shuffle_up is unique with its elegant
+ // "previous" argument so don't get used to it
+ //
+ SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
+ SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);
+
+ //
+ // override shuffle up if this is the first line segment
+ //
+ n0y = select(n0y,m0y,is_y_first);
+ n0x = select(n0x,m0x,is_y_first);
+
+ //
+ // save previous right endpoint
+ //
+ n1x_prev = n1x;
+ n1y_prev = n1y;
+
+ //
+ // decrement by subgroup size
+ //
+ y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+ y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+ y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+#if 0
+ //
+ // debug
+ //
+ if (n0y != n1y) {
+ printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
+ }
+#endif
+
+ //
+ // X-SLIVERING
+ // -----------
+ //
+ // now sliver 1-pixel high lines into at either vertical or
+ // 1-pixel wide lines
+ //
+ // save original direction and work with increasing x
+ //
+ SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x);
+ SKC_RASTERIZE_UINT const dx_xor = x_lt ? 0 : 0x80000000;
+
+ //
+ // save 1/dy
+ //
+ SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x);
+
+ //
+ // how many non-horizontal subpixel y-axis slivers are there?
+ //
+ SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
+ SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
+ SKC_RASTERIZE_FLOAT const x_base = x_lt ? x_min : x_max;
+ SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f);
+
+ //
+ // inclusive subgroup scan of y_segs
+ //
+ SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs);
+ SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs;
+ float x_rem = skc_subgroup_last_float(x_iss);
+
+ //
+ // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails
+ //
+ //if (x_segs == 0.0f)
+ // x_iss = 0.0f;
+
+ //
+ // these values don't matter on first iteration
+ //
+ SKC_RASTERIZE_FLOAT p1x_prev = 0;
+ SKC_RASTERIZE_FLOAT p1y_prev = 0;
+
+ //
+ // loop until done
+ //
+ while (x_rem > 0)
+ {
+ //
+ // distribute work across lanes
+ //
+ SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);
+
+ //
+ // get line at y_source line
+ //
+ SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
+ SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
+ SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
+ SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);
+
+ //
+ // every lane will create a 1 pixel tall line "sliver"
+ //
+ // FIXME -- this gets expanded on SIMD
+ //
+ // if numerator == 1 then this is the first lane
+ // if numerator == s then this is the last lane
+ //
+ SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
+ SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source);
+
+ SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
+ SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count);
+
+ // toggle x_delta sign
+ SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));
+
+ //
+ // calculate "right" line segment endpoint
+ //
+ SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
+ SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
+ SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t));
+
+ //
+ // override c1 if this is last point
+ //
+ p1x = select(p1x,o1x,is_x_last);
+ p1y = select(p1y,o1y,is_x_last);
+
+ //
+ // shuffle up "left" line segment endpoint
+ //
+ // NOTE: Intel's shuffle_up is unique with its elegant
+ // "previous" argument so don't get used to it
+ //
+ SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
+ SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);
+
+ //
+ // override shuffle up if this is the first line segment
+ //
+ p0x = select(p0x,o0x,is_x_first);
+ p0y = select(p0y,o0y,is_x_first);
+
+ //
+ // save previous right endpoint
+ //
+ p1x_prev = p1x;
+ p1y_prev = p1y;
+
+ //
+ // decrement by subgroup size
+ //
+ x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+ x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+ x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+ //
+ // only non-horizontal subpixel lines are valid
+ //
+ SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);
+
+ //
+ // if no lanes are active then continue
+ //
+ // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
+ // IMPACTS PERFORMANCE (+12% ?)
+ //
+ // IT SHOULDN'T !!!
+ //
+#if 0
+ if (!skc_subgroup_any(is_active))
+ continue;
+#endif
+
+ //
+ // Option 1: use SLM for explicitly managed coalesced stores
+ //
+ // 1. which tile does this line belong?
+ // 2. hash tile coordinates
+ // 3. lookup hash
+ // 4. if tile matches then SLM append keys
+ // 5. if tile doesn't match
+ // a. flush
+ // b. create new TTSK_RYX
+ // c. obtain TTSB block from pool
+ // d. goto 3.
+ //
+
+ //
+ // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
+ //
+ // 1. which tile does this line belong?
+ // 2. hash tile coordinates
+ // 3. lookup hash
+ // 4. if tile matches then GMEM append keys
+ // 5. if tile doesn't match
+ // a. flush (and invalidate empty elems)
+ // b. create new TTSK_RYX
+ // c. obtain TTSB block from pool
+ // d. goto 3.
+ //
+
+ //
+ // The virtual rasterization surface is very large and
+ // signed: +/- ~64K-256K, depending on the architecture.
+ //
+ // Rasters must be clipped to the virtual surface and,
+ // optionally, clipped even further on a per raster
+ // basis.
+ //
+
+ //
+ // Clip to the per-raster clip
+ //
+
+ /*
+
+ CLIP HERE
+
+ */
+
+ //
+ // Hash the tile coordinates
+ //
+ // This table lists nominal values for each architecture.
+ // We want to choose values that are naturally fit the
+ // "width" of the architecture.
+ //
+ // SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS
+ // ---- ------- ---- --------- -------- ---------
+ // 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ?
+ // 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX*
+ // 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,?
+ // 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN*
+ // 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon
+ //
+ // NOTE: When possible, bias the hash toward using more y
+ // bits because of:
+ //
+ // 1. the 90 degree counter-clockwise rotation that we put
+ // in place to offset the render-time clockwise
+ // rotation
+ //
+ // 2. the likely presence of left-to-right or
+ // right-to-left glyphs.
+ //
+ // For power-of-two bins, the hash is easy.
+ //
+ // For non-power-of-two, we may want to either implement a
+ // fast mod (compiler should do this for us... hahahaha) or
+ // drop down to the next power-of-two.
+ //
+
+ //
+ // FIXME -- this snarl is not good -- can probably reduce
+ // some of the sign casting but some is there to vectorize a
+ // scalar
+ //
+ SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
+ SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);
+
+ SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
+ SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);
+
+ SKC_RASTERIZE_INT const min_y = min(z0y,z1y);
+ SKC_RASTERIZE_INT const max_y = max(z0y,z1y);
+
+ SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;
+
+ SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
+ SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);
+
+ //
+ // map [+1,+32] to [ 0,+31]
+ // map [-1,-32] to [-1,-32]
+ //
+ SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26;
+
+ SKC_RASTERIZE_INT const min_x = min(z0x,z1x);
+ SKC_RASTERIZE_INT const max_x = max(z0x,z1x);
+ SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;
+
+ SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
+ SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);
+
+ SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx;
+
+ SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
+ (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));
+
+ SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));
+
+#if 0
+ printf("(%3u, %3u)\n",tile_y,tile_x);
+#endif
+
+#if 0
+ if (is_active)
+ printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
+#endif
+
+ //
+ // debug
+ //
+#if 0 // PRINTF_ENABLE
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) \
+ if (is_active C) \
+ printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);
+
+ SKC_RASTERIZE_VECTOR_EXPAND();
+#else
+ if (is_active)
+ printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
+#endif
+
+#endif
+ //
+ // flush all active lanes
+ //
+ while (true)
+ {
+ //
+ // either gather load or vector load+shuffle the yx keys
+ //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+ SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx;
+ SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash);
+#else
+ SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash];
+#endif
+
+ //
+ // does yx for lane match yx for hash?
+ //
+ SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
+ SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx);
+
+ //
+ // OpenCL spec: "When casting a bool to a vector integer
+ // data type, the vector components will be set to -1
+ // (i.e. all bits set) if the vector bool value is true
+ // and 0 otherwise.
+ //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+ SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match;
+#else
+ SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0}
+#endif
+ //
+ // how many new elements for each matching hash bin?
+ //
+ SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
+ SKC_RASTERIZE_UINT const h = h_match << h_shl;
+
+ //
+ // prefix sum all of the bins in parallel
+ //
+ SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h);
+ SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss);
+
+ //
+ // current bin counts
+ //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+ SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count;
+ SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash);
+#else
+ SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash];
+#endif
+
+ //
+ // calculate where each cache-hit and in-bounds tts should be stored
+ //
+ SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
+ SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;
+
+ //
+ // which lanes can append to a matching bin?
+ //
+ SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);
+
+ //
+ // scatter append tts elements to bin blocks
+ //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
+ //
+ // SIMD
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) \
+ if (is_append C) \
+ { \
+ smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \
+ smem->bin.aN.count[hash C] = count_new C; \
+ }
+
+ SKC_RASTERIZE_VECTOR_EXPAND();
+#else
+ //
+ // SIMT
+ //
+ if (is_append)
+ {
+ smem->bin.aN.ttsb [hash][ttsb_index] = tts;
+ smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
+ }
+#endif
+ //
+ // try to keep predicate updates SIMD-friendly and
+ // outside of predicated code paths -- this is not
+ // always how we would normally do things on SIMT but
+ // either approach is acceptable
+ //
+
+ //
+ // mask off lanes/components that successfully appended
+ //
+ is_active = is_active && !is_append;
+
+ //
+ // are there any active lanes left?
+ //
+ if (!skc_subgroup_any(is_active))
+ break;
+
+ //
+      // If there are active lanes that couldn't be appended to a
+      // bin because their hashes collided with the bin's
+      // current ryx key, then those bins must be ejected.
+ //
+ // Note that we do not eject "full" bins because lazily
+ // waiting for a collision results in simpler code.
+ //
+ skc_flush(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ subblocks,
+ blocks,
+ blocks_next,
+ sk_v,
+ sk_v_next,
+ sk_extent,
+ smem,
+ hash,
+ yx,
+ is_active);
+ }
+ }
+ }
+}
+
+//
+// INITIALIZE SMEM
+//
+// Note that SIMD/SIMT have nearly the same syntax.
+//
+static
+void
+skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
+{
+ //
+ // initialize smem bins
+ //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+ //
+ // SIMD
+ //
+ smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT );
+ smem->bin.vN.count = ( 0 );
+#else
+ //
+ // SIMT
+ //
+ int idx = skc_subgroup_lane();
+
+#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
+ if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
+#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
+ for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
+#endif
+ {
+ smem->bin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT );
+ smem->bin.aN.count[idx] = ( 0 );
+ }
+#endif
+}
+
+//
+// RASTERIZE CUBIC KERNEL
+//
+
+static
+void
+skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __local struct skc_subgroup_smem volatile * const smem,
+
+ skc_uint * const nodeword,
+ skc_block_id_t * const id,
+
+ union skc_transform const * const tv,
+ union skc_path_clip const * const cv,
+ skc_uint const cohort)
+{
+ //
+ // the initial segment idx and segments-per-block constant determine
+ // how many block ids will need to be loaded
+ //
+ SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ //
+ // apply transform
+ //
+ // note that we only care if the end points are rounded to subpixel precision
+ //
+ // FIXME -- transformation is currently affine-only support perspective later
+ //
+ // the affine transformation requires 8 FMA + 2 ROUND operations
+ //
+ SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx);
+ SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty);
+
+ SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx;
+ SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty;
+
+ SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx + c2y * tv->shx + tv->tx;
+ SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy + tv->ty;
+
+ SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx + c3y * tv->shx + tv->tx);
+ SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy + tv->ty);
+
+ //
+ //
+ //
+#if PRINTF_ENABLE
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) \
+ printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \
+ " { %.02f, %.02f }, { %.02f, %.02f } },\n", \
+ b0x C,b0y C,t1x C,t1y C, \
+ t2x C,t2y C,t3x C,t3y C);
+
+ SKC_RASTERIZE_VECTOR_EXPAND();
+
+#else
+
+ printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
+ b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
+
+#endif
+
+#endif
+
+ //
+ // OLD APPROACH
+ // ------------
+ //
+ // The Spinel CUDA rasterizer was significantly more complex and
+ // performed a few different tasks that are probably best kept
+ // separate.
+ //
+ // The Spinel rasterizer Bezier held 4-element x and y coordinates
+ // in adjacent lanes. This simplified intermingling of single lane
+ // 4-coordinate line segments with two-lane cubic Beziers.
+ //
+ // After transformation of the input segments, the Spinel rasterizer
+ // would test cubics for flatness and, if flat, collapse the
+ // adjacent lanes into a single line lane and an empty lane.
+ //
+ // Any lines would then be appended to a line queue.
+ //
+ // Any cubics would then be subdivided.
+ //
+ // The reclassification process would be repeated.
+ //
+ // NEW APPROACH
+ // ------------
+ //
+ // Assume we're only working with cubics in this kernel.
+ //
+ // Optimization: if the line segment is a special case -- a cusp,
+ // has 1+ inflections, or a loop -- it might be beneficial to
+ // subdivide the control cage 1+ times in order to separate the
+ // flatter segments the high-velocity region(s).
+ //
+ // This means we want to split using [a,b] formulation to _directly_
+ // subdivide producing a new control cage.
+ //
+ // Wang's Formula is still useful even if we subdivide once or twice
+ // as it's so cheap that it might give some useful hints about where
+ // the high-velocity sections of curve reside.
+ //
+ // But it seems like using Wang's and directly flattening to line
+ // segments without any subdivision is good enough for the limited
+ // set of test cases that I've tried.
+ //
+ // So... use Wang's Formula to estimate how many line segment are
+ // required to properly flatten the cubics.
+ //
+ // Then use inclusive/exclusive scans to put all the lanes to work:
+ //
+ // 1. segmenting cubics to line segments
+ //
+ // 2. slivering line segments into 1-pixel high line segments
+ //
+ // 3. slivering 1-pixel high line segments into 1-pixel wide line
+ // segments
+ //
+ // MORE BACKGROUND ON NEW APPROACH
+ // -------------------------------
+ //
+ // Two options for handling line segments:
+ //
+ // 1. append the line segments onto an SLM array until enough
+ // work has been accrued (Spinel does this)
+ //
+ // 2. immediately sliver the potentially multi-pixel line
+ // segments into subpixel lines
+ //
+ // The advantage of (1) is that it guarantees the slivering
+ // process will, on average, always be emitting a full subgroup
+ // of subpixel lines.
+ //
+ // The advantage of (2) is that it reduces code complexity and
+ // leaves more room for SLM tile bins. The difference between Spinel
+ // and Skia Compute is that Wang's Formula guarantees there will be
+ // a full subgroup of multi-pixel lines unless this is the final
+ // iteration of the warp of multi-pixel lines.
+ //
+ // Note that wider GPU architectures might benefit from (1) and
+ // other work accumulation strategies because it will minimize
+ // partial warp workloads in the final iteration of each stage. It
+ // also minimizes the sunk cost of the uniform control logic steps.
+ //
+ // So let's implement (2) for now...
+ //
+
+ //
+ // And... begin!
+ //
+ // Estimate how many line segments are in quad/cubic curve.
+ //
+ // Wang's Formula will return zero if the control points are
+ // collinear but we bump it up to 1.0f.
+ //
+ SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
+
+ //
+ // if there are free registers then precalculate the reciprocal for
+ // each estimated segments since it will never change
+ //
+ SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
+
+
+ //
+ // inclusive add scan of estimated line segments
+ // exclusive add scan of estimated line segments
+ // total number of estimated line segments
+ //
+ SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
+ SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
+ float s_rem = skc_subgroup_last_float(s_iss); // scalar
+
+ //
+ // Precompute cubic polynomial coefficients from transformed control
+ // cage so we can shuffle them in on each iteration of the outer
+ // loop and then evaluate the polynomial in Horner form.
+ //
+ // | 1 0 0 0 | | c0 |
+ // | | | |
+ // | -3 3 0 0 | | c1 |
+ // B(t) = [ 1 t^1 t^2 t^3 ] | | | |
+ // | 3 -6 3 0 | | c2 |
+ // | | | |
+ // | -1 3 -3 1 | | c3 |
+ //
+ //
+ SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL
+ SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL
+
+ SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL
+ SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL
+
+ SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
+ SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
+
+ //
+ // these values don't matter on the first iteration
+ //
+ SKC_RASTERIZE_FLOAT l1x_prev = 0;
+ SKC_RASTERIZE_FLOAT l1y_prev = 0;
+
+ //
+ // allocate and init in-register TTSK keys
+ //
+ skc_uint sk_v_next = 0;
+ skc_ttsk_v_t sk_v;
+
+ sk_v.hi = cohort;
+
+ //
+ // initialize smem
+ //
+ skc_smem_init(smem);
+
+ //
+ // initialize blocks / subblocks
+ //
+ skc_block_id_v_t blocks;
+ skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+ skc_block_id_t subblocks = 0;
+#endif
+
+ //
+ // loop until done
+ //
+ while (s_rem > 0)
+ {
+ //
+ // distribute work across lanes
+ //
+ SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
+
+ //
+ // every lane has a fraction to work off of
+ //
+ // FIXME -- this gets expanded on SIMD
+ //
+ // if delta == 1 then this is the first lane
+ // if count == s_segs then this is the last lane
+ //
+ SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
+ SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source);
+
+ SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
+ SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count);
+
+ //
+ // init parametric t
+ //
+ SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
+
+ //
+ // if last then override to a hard 1.0f
+ //
+ s_t = is_s_last ? 1.0f : s_t;
+
+ //
+ // decrement by subgroup size
+ //
+ s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+ s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+ s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+ //
+ // now every lane knows what to do and the following lines will
+ // pump out up to SUBGROUP_SIZE line segments
+ //
+ // obtain the src vertices through shared or via a shuffle
+ //
+
+ //
+ // shuffle in the polynomial coefficients their source lane
+ //
+ SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
+ SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
+
+ SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
+ SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
+
+ SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
+ SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
+
+ SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
+ SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);
+
+ //
+ // calculate "right" line segment endpoint using Horner form
+ //
+ SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
+ SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND
+
+ //
+ // shuffle up "left" line segment endpoint
+ //
+ // NOTE: Intel's shuffle_up is unique with its elegant
+ // "previous" argument so don't get used to it
+ //
+ SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
+ SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
+
+ //
+ // save previous right endpoint
+ //
+ l1x_prev = l1x;
+ l1y_prev = l1y;
+
+ //
+ // override shuffle up if this is the first line segment
+ //
+ l0x = select(l0x,s0x,is_s_first);
+ l0y = select(l0y,s0y,is_s_first);
+
+ //
+ // sliver lines
+ //
+ skc_sliver(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ &subblocks,
+ &blocks,
+ &blocks_next,
+ &sk_v,
+ &sk_v_next,
+ sk_extent,
+ smem,
+ l0x,l0y,l1x,l1y);
+ }
+
+ //
+ // - flush work-in-progress blocks
+ // - return unused block ids
+ //
+ skc_finalize(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ &blocks,
+ blocks_next,
+ &sk_v,
+ sk_v_next,
+ sk_extent,
+ smem);
+}
+
+//
+// RASTERIZE QUAD KERNEL
+//
+
+static
+void
+skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __local struct skc_subgroup_smem volatile * const smem,
+
+ skc_uint * const nodeword,
+ skc_block_id_t * const id,
+
+ union skc_transform const * const tv,
+ union skc_path_clip const * const cv,
+ skc_uint const cohort)
+{
+ //
+ // the initial segment idx and segments-per-block constant determine
+ // how many block ids will need to be loaded
+ //
+ SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ //
+ // apply transform
+ //
+ // note that we only care if the end points are rounded to subpixel precision
+ //
+ // FIXME -- transformation is currently affine-only support perspective later
+ //
+ // the affine transformation requires 8 FMA + 2 ROUND operations
+ //
+ SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx);
+ SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty);
+
+ SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx;
+ SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty;
+
+ SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx + c2y * tv->shx + tv->tx);
+ SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy + tv->ty);
+
+ //
+ // Estimate how many line segments are in quad/cubic curve.
+ //
+ // Wang's Formula will return zero if the control points are
+ // collinear but we bump it up to 1.0f.
+ //
+ SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);
+
+ //
+ // if there are free registers then precalculate the reciprocal for
+ // each estimated segments since it will never change
+ //
+ SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
+
+
+ //
+ // inclusive add scan of estimated line segments
+ // exclusive add scan of estimated line segments
+ // total number of estimated line segments
+ //
+ SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
+ SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
+ float s_rem = skc_subgroup_last_float(s_iss); // scalar
+
+ //
+ // Precompute quadratic polynomial coefficients from control cage so
+ // we can shuffle them in on each iteration of the outer loop and
+ // then evaluate the polynomial in Horner form.
+ //
+
+ // | 1 0 0 | | c0 |
+ // | | | |
+ // B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 |
+ // | | | |
+ // | 1 -2 1 | | c2 |
+ //
+ //
+ SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
+ SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL
+
+ SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD
+ SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD
+
+ //
+ // these values don't matter on the first iteration
+ //
+ SKC_RASTERIZE_FLOAT l1x_prev = 0;
+ SKC_RASTERIZE_FLOAT l1y_prev = 0;
+
+ //
+ // allocate and init in-register TTSK keys
+ //
+ skc_uint sk_v_next = 0;
+ skc_ttsk_v_t sk_v;
+
+ sk_v.hi = cohort;
+
+ //
+ // initialize smem
+ //
+ skc_smem_init(smem);
+
+ //
+ // initialize blocks / subblocks
+ //
+ skc_block_id_v_t blocks;
+ skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+ skc_block_id_t subblocks = 0;
+#endif
+
+ //
+ // loop until done
+ //
+ while (s_rem > 0)
+ {
+ //
+ // distribute work across lanes
+ //
+ SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
+
+ //
+ // every lane has a fraction to work off of
+ //
+ // FIXME -- this gets expanded on SIMD
+ //
+ // if delta == 1 then this is the first lane
+ // if count == s_segs then this is the last lane
+ //
+ SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
+ SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source);
+
+ SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
+ SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count);
+
+ //
+ // init parametric t
+ //
+ SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
+
+ //
+ // if last then override to a hard 1.0f
+ //
+ s_t = is_s_last ? 1.0f : s_t;
+
+ //
+ // decrement by subgroup size
+ //
+ s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+ s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+ s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+ //
+ // now every lane knows what to do and the following lines will
+ // pump out up to SUBGROUP_SIZE line segments
+ //
+ // obtain the src vertices through shared or via a shuffle
+ //
+
+ //
+ // shuffle in the polynomial coefficients their source lane
+ //
+ SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
+ SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
+
+ SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
+ SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
+
+ SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
+ SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
+
+ //
+ // calculate "right" line segment endpoint using Horner form
+ //
+ SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
+ SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND
+
+ //
+ // shuffle up "left" line segment endpoint
+ //
+ // NOTE: Intel's shuffle_up is unique with its elegant
+ // "previous" argument so don't get used to it
+ //
+ SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
+ SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
+
+ //
+ // save previous right endpoint
+ //
+ l1x_prev = l1x;
+ l1y_prev = l1y;
+
+ //
+ // override shuffle up if this is the first line segment
+ //
+ l0x = select(l0x,s0x,is_s_first);
+ l0y = select(l0y,s0y,is_s_first);
+
+ //
+ // sliver lines
+ //
+ skc_sliver(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ &subblocks,
+ &blocks,
+ &blocks_next,
+ &sk_v,
+ &sk_v_next,
+ sk_extent,
+ smem,
+ l0x,l0y,l1x,l1y);
+ }
+
+ //
+ // - flush work-in-progress blocks
+ // - return unused block ids
+ //
+ skc_finalize(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ &blocks,
+ blocks_next,
+ &sk_v,
+ sk_v_next,
+ sk_extent,
+ smem);
+}
+
+//
+// RASTERIZE LINE KERNEL
+//
+
+static
+void
+skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __local struct skc_subgroup_smem volatile * const smem,
+
+ skc_uint * const nodeword,
+ skc_block_id_t * const id,
+
+ union skc_transform const * const tv,
+ union skc_path_clip const * const cv,
+ skc_uint const cohort)
+{
+ //
+ // the initial segment idx and segments-per-block constant determine
+ // how many block ids will need to be loaded
+ //
+ SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+ skc_segment_next(bp_elems,nodeword,id);
+
+ SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+#if 0
+ // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y);
+ printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
+#endif
+
+ //
+ // apply transform
+ //
+ // note that we only care if the end points are rounded to subpixel precision
+ //
+ // FIXME -- transformation is currently affine-only
+ // FIXME -- support perspective later
+ //
+ // the affine transformation requires 8 FMA + 4 ROUND operations
+ //
+ SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx);
+ SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty);
+
+ SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx + c1y * tv->shx + tv->tx);
+ SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy + tv->ty);
+
+#if 0
+ printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
+#endif
+
+ //
+ // allocate and init in-register TTSK keys
+ //
+ skc_uint sk_v_next = 0;
+ skc_ttsk_v_t sk_v;
+
+ sk_v.hi = cohort;
+
+ //
+ // initialize smem
+ //
+ skc_smem_init(smem);
+
+ //
+ // initialize blocks / subblocks
+ //
+ skc_block_id_v_t blocks;
+ skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+ skc_block_id_t subblocks = 0;
+#endif
+
+ //
+ // sliver lines
+ //
+ skc_sliver(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ &subblocks,
+ &blocks,
+ &blocks_next,
+ &sk_v,
+ &sk_v_next,
+ sk_extent,
+ smem,
+ l0x,l0y,l1x,l1y);
+
+ //
+ // - flush work-in-progress blocks
+ // - return unused block ids
+ //
+ skc_finalize(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ &blocks,
+ blocks_next,
+ &sk_v,
+ sk_v_next,
+ sk_extent,
+ smem);
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __global float8 const * const transforms, // FIXME -- __constant
+ __global float4 const * const clips, // FIXME -- __constant
+ __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
+ skc_uint const count)
+{
+ //
+ // declare shared memory block
+ //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+ __local struct skc_subgroup_smem volatile smem[1];
+#else
+ __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+ __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+
+ //
+ // this is a subgroup/warp-centric kernel
+ //
+ // which subgroup in the grid is this?
+ //
+ // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+ // get_group_id(0) as a uniform but the alternative calculation used
+ // when there are multiple subgroups per workgroup is not
+ // cooperating and driving spillage elsewhere.
+ //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+ uint const cmd_idx = get_group_id(0);
+#else
+ uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("+cmd_idx = %u\n",cmd_idx);
+#endif
+
+ //
+ // if worksgroups are multi-subgroup then there may be excess
+ // subgroups in the final workgroup
+ //
+ if (cmd_idx >= count)
+ return;
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("-cmd_idx = %u\n",cmd_idx);
+#endif
+
+ //
+ // load a single command for this subgroup
+ //
+ union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("[ %u ]< %u, %u, %u, %u >\n",
+ cmd_idx,
+ cmd.nodeword,
+ SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
+ SKC_CMD_RASTERIZE_GET_CLIP(cmd),
+ SKC_CMD_RASTERIZE_GET_COHORT(cmd));
+#endif
+
+ //
+ // get first block node command word and its subblock
+ //
+ skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
+ skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
+ skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
+ skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+ //
+ // load transform -- uniform across subgroup
+ //
+ // v8: { sx shx tx shy sy ty w0 w1 }
+ //
+ // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+ //
+ // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+ //
+ // Coordinates are scaled to subpixel resolution. All that matters
+ // is that continuity is maintained between end path element
+ // endpoints.
+ //
+ // It's the responsibility of the host to ensure that the transforms
+ // are properly scaled either via intitializing a transform stack
+ // with the subpixel resolution scaled identity or scaling the
+ // transform before its loaded by a rasterization grid.
+ //
+ // FIXME -- horizontal load might be better than this broadcast load
+ //
+ union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+ union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
+ skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+ switch (tag)
+ {
+ case SKC_BLOCK_ID_TAG_PATH_LINE:
+ skc_rasterize_lines(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ sk_extent,
+ smem,
+ &nodeword,&id,
+ &tv,&cv,cohort);
+ break;
+
+ case SKC_BLOCK_ID_TAG_PATH_QUAD:
+ skc_rasterize_quads(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ sk_extent,
+ smem,
+ &nodeword,&id,
+ &tv,&cv,cohort);
+ break;
+
+ case SKC_BLOCK_ID_TAG_PATH_CUBIC:
+ skc_rasterize_cubics(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ sk_extent,
+ smem,
+ &nodeword,&id,
+ &tv,&cv,cohort);
+ break;
+
+ case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:
+ break;
+ case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC:
+ break;
+
+ default:
+ break;
+ }
+}
+
+//
+//
+//
+
+//
+// Rasterize the line segments of a raster cohort.
+//
+// Subgroup-centric kernel: each subgroup independently processes one
+// rasterization command. Unlike the generic dispatch kernel above,
+// the block tag is never inspected -- skc_rasterize_lines() is
+// invoked unconditionally, so this entry point is presumably only
+// enqueued for paths known to contain nothing but line segments
+// (confirm against the host-side dispatch).
+//
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __global float8 const * const transforms, // FIXME -- __constant
+ __global float4 const * const clips, // FIXME -- __constant
+ __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
+ skc_uint const count)
+{
+ //
+ // declare shared memory block -- one region per subgroup
+ //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+ __local struct skc_subgroup_smem volatile smem[1];
+#else
+ __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+ __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+
+ //
+ // this is a subgroup/warp-centric kernel
+ //
+ // which subgroup in the grid is this?
+ //
+ // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+ // get_group_id(0) as a uniform but the alternative calculation used
+ // when there are multiple subgroups per workgroup is not
+ // cooperating and driving spillage elsewhere.
+ //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+ uint const cmd_idx = get_group_id(0);
+#else
+ uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+ //
+ // if workgroups are multi-subgroup then there may be excess
+ // subgroups in the final workgroup
+ //
+ if (cmd_idx >= count)
+ return;
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+ //
+ // load a single command for this subgroup
+ //
+ union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+ //
+ // get first block node command word and its subblock
+ //
+ skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
+ skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
+ skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+ //
+ // load transform -- uniform across subgroup
+ //
+ // v8: { sx shx tx shy sy ty w0 w1 }
+ //
+ // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+ //
+ // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+ //
+ // Coordinates are scaled to subpixel resolution. All that matters
+ // is that continuity is maintained between path element endpoints.
+ //
+ // It's the responsibility of the host to ensure that the transforms
+ // are properly scaled either via initializing a transform stack
+ // with the subpixel resolution scaled identity or scaling the
+ // transform before it's loaded by a rasterization grid.
+ //
+ // FIXME -- horizontal load might be better than this broadcast load
+ //
+ union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+ union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
+ skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+ skc_rasterize_lines(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ sk_extent,
+ smem,
+ &nodeword,&id,
+ &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+//
+//
+//
+
+//
+// Rasterize the quadratic Bezier segments of a raster cohort.
+//
+// Subgroup-centric kernel: each subgroup independently processes one
+// rasterization command. The block tag is never inspected --
+// skc_rasterize_quads() is invoked unconditionally, so this entry
+// point is presumably only enqueued for paths known to contain
+// nothing but quads (confirm against the host-side dispatch).
+//
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __global float8 const * const transforms, // FIXME -- __constant
+ __global float4 const * const clips, // FIXME -- __constant
+ __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
+ skc_uint const count)
+{
+ //
+ // declare shared memory block -- one region per subgroup
+ //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+ __local struct skc_subgroup_smem volatile smem[1];
+#else
+ __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+ __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+
+ //
+ // this is a subgroup/warp-centric kernel
+ //
+ // which subgroup in the grid is this?
+ //
+ // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+ // get_group_id(0) as a uniform but the alternative calculation used
+ // when there are multiple subgroups per workgroup is not
+ // cooperating and driving spillage elsewhere.
+ //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+ uint const cmd_idx = get_group_id(0);
+#else
+ uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+ //
+ // if workgroups are multi-subgroup then there may be excess
+ // subgroups in the final workgroup
+ //
+ if (cmd_idx >= count)
+ return;
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+ //
+ // load a single command for this subgroup
+ //
+ union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+ //
+ // get first block node command word and its subblock
+ //
+ skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
+ skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
+ skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+ //
+ // load transform -- uniform across subgroup
+ //
+ // v8: { sx shx tx shy sy ty w0 w1 }
+ //
+ // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+ //
+ // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+ //
+ // Coordinates are scaled to subpixel resolution. All that matters
+ // is that continuity is maintained between path element endpoints.
+ //
+ // It's the responsibility of the host to ensure that the transforms
+ // are properly scaled either via initializing a transform stack
+ // with the subpixel resolution scaled identity or scaling the
+ // transform before it's loaded by a rasterization grid.
+ //
+ // FIXME -- horizontal load might be better than this broadcast load
+ //
+ union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+ union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
+ skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+ skc_rasterize_quads(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ sk_extent,
+ smem,
+ &nodeword,&id,
+ &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+//
+// Rasterize the cubic Bezier segments of a raster cohort.
+//
+// Subgroup-centric kernel: each subgroup independently processes one
+// rasterization command. The block tag is never inspected --
+// skc_rasterize_cubics() is invoked unconditionally, so this entry
+// point is presumably only enqueued for paths known to contain
+// nothing but cubics (confirm against the host-side dispatch).
+//
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __global float8 const * const transforms, // FIXME -- __constant
+ __global float4 const * const clips, // FIXME -- __constant
+ __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
+ skc_uint const count)
+{
+ //
+ // declare shared memory block -- one region per subgroup
+ //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+ __local struct skc_subgroup_smem volatile smem[1];
+#else
+ __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+ __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+
+ //
+ // this is a subgroup/warp-centric kernel
+ //
+ // which subgroup in the grid is this?
+ //
+ // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+ // get_group_id(0) as a uniform but the alternative calculation used
+ // when there are multiple subgroups per workgroup is not
+ // cooperating and driving spillage elsewhere.
+ //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+ uint const cmd_idx = get_group_id(0);
+#else
+ uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+ //
+ // if workgroups are multi-subgroup then there may be excess
+ // subgroups in the final workgroup
+ //
+ if (cmd_idx >= count)
+ return;
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+ //
+ // load a single command for this subgroup
+ //
+ union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+ //
+ // get first block node command word and its subblock
+ //
+ skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
+ skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
+ skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+ //
+ // load transform -- uniform across subgroup
+ //
+ // v8: { sx shx tx shy sy ty w0 w1 }
+ //
+ // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+ //
+ // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+ //
+ // Coordinates are scaled to subpixel resolution. All that matters
+ // is that continuity is maintained between path element endpoints.
+ //
+ // It's the responsibility of the host to ensure that the transforms
+ // are properly scaled either via initializing a transform stack
+ // with the subpixel resolution scaled identity or scaling the
+ // transform before it's loaded by a rasterization grid.
+ //
+ // FIXME -- horizontal load might be better than this broadcast load
+ //
+ union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+ union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
+ skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+ skc_rasterize_cubics(bp_atomics,
+ bp_elems,
+ bp_ids,
+ bp_mask,
+ cohort_atomics,
+ sk_extent,
+ smem,
+ &nodeword,&id,
+ &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+//
+// Placeholder kernel: rational quad rasterization is not implemented.
+// The signature matches the other rasterization entry points so the
+// host-side kernel table stays uniform; the body is intentionally
+// empty (the generic dispatch kernel's RAT_QUAD case is also a no-op).
+//
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __global float8 const * const transforms, // FIXME -- __constant
+ __global float4 const * const clips, // FIXME -- __constant
+ __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
+ skc_uint const count)
+{
+ ; // no-op -- unimplemented
+}
+
+//
+//
+//
+
+//
+// Placeholder kernel: rational cubic rasterization is not implemented.
+// The signature matches the other rasterization entry points so the
+// host-side kernel table stays uniform; the body is intentionally
+// empty (the generic dispatch kernel's RAT_CUBIC case is also a no-op).
+//
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global union skc_bp_elem * const bp_elems,
+ __global uint * const bp_ids,
+ skc_uint const bp_mask,
+
+ __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
+ __global skc_ttsk_s_t * const sk_extent,
+
+ __global float8 const * const transforms, // FIXME -- __constant
+ __global float4 const * const clips, // FIXME -- __constant
+ __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
+ skc_uint const count)
+{
+ ; // no-op -- unimplemented
+}
+
+//
+//
+//
diff --git a/src/compute/skc/rasters_alloc.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl index f8f76a7b39..0c7da7d0ad 100644 --- a/src/compute/skc/rasters_alloc.cl +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl @@ -1,144 +1,144 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "raster.h" -#include "tile.h" - -// -// There is a fixed-size meta table per raster cohort that we use to -// peform a mostly coalesced sizing and allocation of blocks. -// -// This code is simple and fast. -// - -__kernel -SKC_RASTERS_ALLOC_KERNEL_ATTRIBS -void -skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global skc_block_id_t const * const bp_ids, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t * const map, - __global skc_uint * const metas, - __global skc_uint const * const raster_ids, // FIXME -- CONSTANT - skc_uint const count) -{ - // access to the meta extent is linear - skc_uint const gid = get_global_id(0); - skc_bool const is_active = gid < count; - - // - // init with defaults for all lanes - // - union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } }; - skc_uint raster_id = SKC_UINT_MAX; - skc_uint extra_blocks = 0; - - if (is_active) - { - // load meta_in - meta.in.u32v4 = vload4(gid,metas); - - // load raster_id as early as possible - raster_id = raster_ids[gid]; - -#if 0 - printf("%3u + %5u, %5u, %5u, %5u\n", - gid, - meta.in.blocks, - meta.in.offset, - meta.in.pk, - meta.in.rk); -#endif - - // how many blocks will the ttpb blocks consume? 
- extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) / - SKC_DEVICE_SUBBLOCKS_PER_BLOCK); - - // total keys - meta.out.keys += meta.in.pk; - - // how many blocks do we need to store the keys in the head and trailing nodes? - skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) / - (SKC_RASTER_NODE_DWORDS - 1)); - // increment blocks - extra_blocks += hn; - - // how many nodes trail the head? - meta.out.nodes = hn - 1; - - // update blocks - meta.out.blocks += extra_blocks; - -#if 0 - printf("%3u - %5u, %5u, %5u, %5u\n", - gid, - meta.out.blocks, - meta.out.offset, - meta.out.nodes, - meta.out.keys); -#endif - } - - // - // allocate blocks from block pool - // - // first perform a prefix sum on the subgroup to reduce atomic - // operation traffic - // - // note this idiom can be implemented with vectors, subgroups or - // workgroups - // - - skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks); - skc_uint reads = 0; - - // last lane performs the block pool allocation with an atomic increment - if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) { - reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads - } - - // broadcast block pool base to all lanes - reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1); - - // update base for each lane - reads += prefix - extra_blocks; - - // - // store meta header - // - if (is_active) - { - // store headers back to meta extent - vstore4(meta.out.u32v4,gid,metas); - - // store reads - metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads; - - // get block_id of each raster head - skc_block_id_t const block_id = bp_ids[reads & bp_mask]; - - // update map - map[raster_id] = block_id; - -#if 0 - printf("alloc: %u / %u\n",raster_id,block_id); -#endif - } -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "raster.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "raster_builder_cl_12.h"
+#include "device_cl_12.h"
+
+//
+// There is a fixed-size meta table per raster cohort that we use to
+// perform a mostly coalesced sizing and allocation of blocks.
+//
+// This code is simple and fast.
+//
+
+//
+// Size and allocate block-pool storage for every raster in a cohort.
+//
+// One work-item per cohort entry (linear access to the meta extent).
+// Each lane computes the extra blocks its raster needs, a subgroup
+// exclusive scan coalesces the per-lane requests, and a single lane
+// performs one atomic add against the block pool's read counter for
+// the whole subgroup. NOTE: inactive lanes (gid >= count) must still
+// execute the scan/broadcast below with their zero defaults, which is
+// why the early-out is a flag rather than a return.
+//
+__kernel
+SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
+void
+skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+ __global skc_block_id_t const * const bp_ids,
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t * const map,
+ __global skc_uint * const metas,
+ __global skc_uint const * const raster_ids, // FIXME -- CONSTANT
+ skc_uint const count)
+{
+ // access to the meta extent is linear
+ skc_uint const gid = get_global_id(0);
+ skc_bool const is_active = gid < count;
+
+ //
+ // init with defaults for all lanes -- inactive lanes contribute
+ // zero blocks to the subgroup scan below
+ //
+ union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } };
+ skc_uint raster_id = SKC_UINT_MAX;
+ skc_uint extra_blocks = 0;
+
+ if (is_active)
+ {
+ // load meta_in
+ meta.in.u32v4 = vload4(gid,metas);
+
+ // load raster_id as early as possible
+ raster_id = raster_ids[gid];
+
+#if 0
+ printf("%3u + %5u, %5u, %5u, %5u\n",
+ gid,
+ meta.in.blocks,
+ meta.in.offset,
+ meta.in.pk,
+ meta.in.rk);
+#endif
+
+ // how many blocks will the ttpb blocks consume?
+ // (ceiling division of pk * SKC_TILE_RATIO subblocks into blocks)
+ extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) /
+ SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
+
+ // total keys
+ meta.out.keys += meta.in.pk;
+
+ // how many blocks do we need to store the keys in the head and trailing nodes?
+ // (ceiling division -- the head holds fewer keys than a node)
+ skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
+ (SKC_RASTER_NODE_DWORDS - 1));
+ // increment blocks
+ extra_blocks += hn;
+
+ // how many nodes trail the head?
+ meta.out.nodes = hn - 1;
+
+ // update blocks
+ meta.out.blocks += extra_blocks;
+
+#if 0
+ printf("%3u - %5u, %5u, %5u, %5u\n",
+ gid,
+ meta.out.blocks,
+ meta.out.offset,
+ meta.out.nodes,
+ meta.out.keys);
+#endif
+ }
+
+ //
+ // allocate blocks from block pool
+ //
+ // first perform a prefix sum on the subgroup to reduce atomic
+ // operation traffic
+ //
+ // note this idiom can be implemented with vectors, subgroups or
+ // workgroups
+ //
+
+ skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
+ skc_uint reads = 0;
+
+ // last lane performs the block pool allocation with an atomic increment
+ if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
+ reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
+ }
+
+ // broadcast block pool base to all lanes
+ reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
+
+ // update base for each lane -- convert the inclusive prefix back to
+ // this lane's exclusive offset from the allocated base
+ reads += prefix - extra_blocks;
+
+ //
+ // store meta header
+ //
+ if (is_active)
+ {
+ // store headers back to meta extent
+ vstore4(meta.out.u32v4,gid,metas);
+
+ // store reads
+ metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads;
+
+ // get block_id of each raster head
+ skc_block_id_t const block_id = bp_ids[reads & bp_mask];
+
+ // update map
+ map[raster_id] = block_id;
+
+#if 0
+ printf("alloc: %u / %u\n",raster_id,block_id);
+#endif
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/rasters_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl index f0abdb0381..27411cfe96 100644 --- a/src/compute/skc/rasters_reclaim.cl +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl @@ -1,442 +1,442 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "block.h" -#include "raster.h" -#include "common.h" -#include "tile.h" - -// -// -// - -#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS) - -#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS) - -// -// -// - -#if ( SKC_RASTERS_RECLAIM_X == 1 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_RASTERS_RECLAIM_X == 2 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_RASTERS_RECLAIM_X == 4 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_RASTERS_RECLAIM_X == 8 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_RASTERS_RECLAIM_X == 16) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_RASTERS_RECLAIM_X" -#endif - -#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) 
(SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \ - (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) - -#endif - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - 
SKC_BROADCAST_LAST_HELPER(E,I) - -// -// COMPILE-TIME PREDICATES -// - -#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \ - (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \ - SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I) - -#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \ - SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I) - -// -// RUN-TIME PREDICATES -// - -#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \ - (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS) - -// -// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL -// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK -// COMBOS (NOT NECESSARILY POW2) -// -// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR -// UINT TYPE INSTEAD OF A ULONG. -// - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint - -// -// -// - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ - (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ - ? 
0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ - S = sub_group_scan_exclusive_add(C) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \ - (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK) - -// -// -// - -struct skc_reclaim -{ - skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE]; -}; - -__kernel -SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS -void -skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring - __global skc_uint * const bp_elems, // block pool blocks - __global skc_uint volatile * const bp_atomics, // read/write atomics - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const map, // raster host-to-device map - struct skc_reclaim const reclaim) // array of host raster ids -{ -#if (__OPENCL_VERSION__ < 200) - skc_uint const reclaim_stride = get_num_sub_groups(); -#else - skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); - -#if 0 - // - // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT - // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL - // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE - // RECLAMATION JOB ON THE REST OF THE PIPELINE. 
- // - for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) -#endif - { - // get host raster id - skc_raster_h const raster = reclaim.aN[reclaim_idx]; - - // get block id of raster header - skc_block_id_t id = map[raster]; - - // - // load all of the head block ttxk.lo keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - // load raster header counts -- we only need the blocks and - // nodes words the keys are doublewords. - // - // FIXME -- this can be made portable with compile-time macro expansion - // - skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES - skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes); - } -#endif - // - // acquire a span in the block pool ids ring for reclaimed ids - // - skc_uint bp_ids_base = 0; - - if (get_sub_group_local_id() == 0) { - bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); - } - - bp_ids_base = sub_group_broadcast(bp_ids_base,0); - - // - // mask off everything but the block id - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - h##I = h##I & SKC_TTXK_LO_MASK_ID; \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t 
const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; -#if 0 - printf("rasters next = %u\n",id); -#endif - } - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%08X %u\n",h##I,h##I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - printf("%08X\n",h##I); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - - // - // - we'll skip subgroups that are entirely header - // - // - but we need to mark any header elements that partially fill - // a subgroup as subblocks - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \ - if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \ - h##I = SKC_UINT_MAX; \ - } \ - } \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - { - // - // count reclaimable blocks in each lane - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = h##I; \ - } \ - skc_uint const total = 
index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - } - - // printf("R %7u ! %u\n",bp_ids_idx,h##I); - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, walk the nodes - // - do { - // id of next block is in last lane - id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); - - // - // load all of the node block ttxk.lo keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // mask off everything but the block id - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - n##I = n##I & SKC_TTXK_LO_MASK_ID; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; -#if 0 - printf("rasters next = %u\n",id); -#endif - } - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%08X %u\n",n##I,n##I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - - // - // count reclaimable blocks in each lane - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - 
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = n##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // printf("R %7u ! %u\n",bp_ids_idx,n##I); - - // any more nodes? - } while (--count_nodes > 0); - } -} - -// -// -// +/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "block.h"
+#include "raster.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "device_cl_12.h"
+
+//
+//
+//
+
+#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
+
+#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
+
+//
+//
+//
+
+#if ( SKC_RASTERS_RECLAIM_X == 1 )
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
+
+#elif ( SKC_RASTERS_RECLAIM_X == 2 )
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
+
+#elif ( SKC_RASTERS_RECLAIM_X == 4 )
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
+
+#elif ( SKC_RASTERS_RECLAIM_X == 8 )
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
+
+#elif ( SKC_RASTERS_RECLAIM_X == 16)
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
+
+#else
+#error "MISSING SKC_RASTERS_RECLAIM_X"
+#endif
+
+#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
+
+#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
+ (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
+
+#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
+
+#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
+#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
+
+#endif
+
+//
+// FIXME -- slate these for replacement
+//
+
+#define SKC_BROADCAST(E,S,I) \
+ sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_BROADCAST_LAST_HELPER(E,I) \
+ sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_BROADCAST_LAST(E,I) \
+ SKC_BROADCAST_LAST_HELPER(E,I)
+
+//
+// COMPILE-TIME PREDICATES
+//
+
+#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
+ SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
+ (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
+ (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
+ SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
+
+#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
+ SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
+
+//
+// RUN-TIME PREDICATES
+//
+
+#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
+ (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
+
+//
+// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
+// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
+// COMBOS (NOT NECESSARILY POW2)
+//
+// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
+// UINT TYPE INSTEAD OF A ULONG.
+//
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
+
+//
+//
+//
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
+ (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
+ ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
+ S = sub_group_scan_exclusive_add(C)
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
+ (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
+
+//
+//
+//
+
+struct skc_reclaim
+{
+ skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
+};
+
+__kernel
+SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
+void
+skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
+ __global skc_uint * const bp_elems, // block pool blocks
+ __global skc_uint volatile * const bp_atomics, // read/write atomics
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const map, // raster host-to-device map
+ struct skc_reclaim const reclaim) // array of host raster ids
+{
+#if (__OPENCL_VERSION__ < 200)
+ skc_uint const reclaim_stride = get_num_sub_groups();
+#else
+ skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
+#endif
+ skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
+
+#if 0
+ //
+ // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
+ // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
+ // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
+ // RECLAMATION JOB ON THE REST OF THE PIPELINE.
+ //
+ for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
+#endif
+ {
+ // get host raster id
+ skc_raster_h const raster = reclaim.aN[reclaim_idx];
+
+ // get block id of raster header
+ skc_block_id_t id = map[raster];
+
+ //
+ // load all of the head block ttxk.lo keys into registers
+ //
+ // FIXME -- this pattern lends itself to using the higher
+ // performance Intel GEN block load instructions
+ //
+ skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // pick out count.nodes and count.prims from the header
+ //
+ // load raster header counts -- we only need the blocks and
+ // nodes words the keys are doublewords.
+ //
+ // FIXME -- this can be made portable with compile-time macro expansion
+ //
+ skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
+ skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
+
+#if 0
+ if (get_sub_group_local_id() == 0) {
+ printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
+ }
+#endif
+ //
+ // acquire a span in the block pool ids ring for reclaimed ids
+ //
+ skc_uint bp_ids_base = 0;
+
+ if (get_sub_group_local_id() == 0) {
+ bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
+ }
+
+ bp_ids_base = sub_group_broadcast(bp_ids_base,0);
+
+ //
+ // mask off everything but the block id
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ h##I = h##I & SKC_TTXK_LO_MASK_ID; \
+ }
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // swap current id with next
+ //
+ if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
+ {
+ skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+ SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+ id = next;
+#if 0
+ printf("rasters next = %u\n",id);
+#endif
+ }
+
+#if 0
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ printf("%08X %u\n",h##I,h##I);
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+#endif
+
+#if 0
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ printf("%08X\n",h##I); \
+ }
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+#endif
+
+ //
+ // - we'll skip subgroups that are entirely header
+ //
+ // - but we need to mark any header elements that partially fill
+ // a subgroup as subblocks
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
+ if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
+ h##I = SKC_UINT_MAX; \
+ } \
+ } \
+ }
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+ {
+ //
+ // count reclaimable blocks in each lane
+ //
+ SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
+ }
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // scan to find index of each block
+ //
+ SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+ SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+ //
+ // store blocks back to ring
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+ skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+ skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
+ if (count > 0) { \
+ bp_ids[bp_ids_idx] = h##I; \
+ } \
+ skc_uint const total = index + count; \
+ bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
+ }
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+ }
+
+ // printf("R %7u ! %u\n",bp_ids_idx,h##I);
+
+ //
+ // we're done if it was just the header
+ //
+ if (count_nodes == 0)
+ return;
+
+ //
+ // otherwise, walk the nodes
+ //
+ do {
+ // id of next block is in last lane
+ id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
+
+ //
+ // load all of the node block ttxk.lo keys into registers
+ //
+ // FIXME -- this pattern lends itself to using the higher
+ // performance Intel GEN block load instructions
+ //
+ skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // mask off everything but the block id
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ n##I = n##I & SKC_TTXK_LO_MASK_ID;
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // swap current id with next
+ //
+ if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
+ {
+ skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+ SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+ id = next;
+#if 0
+ printf("rasters next = %u\n",id);
+#endif
+ }
+
+#if 0
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ printf("%08X %u\n",n##I,n##I);
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+#endif
+
+ //
+ // count reclaimable blocks in each lane
+ //
+ SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // scan to find index of each block
+ //
+ SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+ SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+ //
+ // store blocks back to ring
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) { \
+ skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+ skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+ skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
+ if (count > 0) { \
+ bp_ids[bp_ids_idx] = n##I; \
+ } \
+ skc_uint const total = index + count; \
+ bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
+ }
+
+ SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+ // printf("R %7u ! %u\n",bp_ids_idx,n##I);
+
+ // any more nodes?
+ } while (--count_nodes > 0);
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/render.cl b/src/compute/skc/platforms/cl_12/kernels/render.cl index ba2fd7bbfd..9205334940 100644 --- a/src/compute/skc/render.cl +++ b/src/compute/skc/platforms/cl_12/kernels/render.cl @@ -1,2165 +1,2165 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "block.h" -#include "tile.h" -#include "atomic_cl.h" -#include "styling_types.h" - -// -// -// - -#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1) - -// -// -// - -#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15 -#endif - -// -// tile state flag bits -// - -typedef enum skc_tile_flags_e { - - // FLUSH - SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001, - SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002, - SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004, - - // OPACITY - SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008, - - // - // Note: testing for opacity and skipping scattering is on its way - // to becoming a much more programmable option because sometimes we - // may be compositing/blending from back-to-front and/or be using - // group blend rules that ignore opacity. 
- // - // The point is that all of these decisions should be encoded in - // styling commands and, as much as possible, removed from the final - // group/layer styling traversal render loop. - // - -} skc_tile_flags_e; - -// -// COVER -- assumes availability of either fp16 or fp32 -// - -union skc_tile_cover -{ - struct { - SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH]; - } aN; - -#ifdef SKC_RENDER_TILE_COVER_VECTOR - struct { - SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT]; - } vN; -#endif -}; - -// -// COLOR -- assumes availability of either fp16 or fp32 -// - -union skc_tile_color -{ - union { - struct { - SKC_RENDER_TILE_COLOR r; - SKC_RENDER_TILE_COLOR g; - SKC_RENDER_TILE_COLOR b; - SKC_RENDER_TILE_COLOR a; - } rgba[SKC_TILE_WIDTH]; - } aN; - -#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED - union { - SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH]; - } iN; -#endif - -#ifdef SKC_RENDER_TILE_COLOR_VECTOR - union { - SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT]; - } vN; -#endif - - struct { - union { - struct { - SKC_RENDER_TILE_COLOR r; - SKC_RENDER_TILE_COLOR g; - }; - SKC_RENDER_GRADIENT_FLOAT distance; - }; - union { - struct { - SKC_RENDER_TILE_COLOR b; - SKC_RENDER_TILE_COLOR a; - }; - SKC_RENDER_GRADIENT_FLOAT stoplerp; - }; - } grad[SKC_TILE_WIDTH]; -}; - -// -// SHARED MEMORY STATE -// - -#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT) - -#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE) -#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA)) - -// -// -// - -union skc_subgroup_smem -{ - // - // The tiles are stored in column-major / height-major order - // - // The final column is a guard column that is OK to write to but - // will never be read. It simplifies the TTSB scatter but could be - // predicated if SMEM is really at a premium. 
- // -#if ( SKC_RENDER_SUBGROUP_SIZE > 1 ) - struct { - SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] - } atomic; -#endif - - struct { - int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] - } aN; - - struct { // assumption is that height = subgroup - SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE]; - } vN; - - struct { // assumption is that height = subgroup - SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE]; - } wide; - - union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; - - half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; - -#if 0 - // - // SPILL TO GMEM - // -#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0) - struct { - -#if (SKC_REGS_COLOR_S > 0) - union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; -#endif - -#if (SKC_REGS_COVER_S > 0) - union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; -#endif - - } regs; -#endif - // - // - // -#endif -}; - -// -// -// - -#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) - -#define skc_subgroup_lane() 0 - -#else - -#define skc_subgroup_lane() get_sub_group_local_id() - -#endif - -// -// -// - -typedef skc_uint skc_ttsk_lo_t; -typedef skc_uint skc_ttsk_hi_t; - -typedef skc_uint skc_ttpk_lo_t; -typedef skc_uint skc_ttpk_hi_t; - -typedef skc_uint skc_ttxk_lo_t; -typedef skc_uint skc_ttxk_hi_t; - -typedef skc_uint skc_ttck_lo_t; -typedef skc_uint skc_ttck_hi_t; - -typedef skc_uint2 skc_ttck_t; - -typedef skc_int skc_ttxb_t; - -// -// TTCK (32-BIT COMPARE) v1: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 18 | 7 | 7 | -// -// -// TTCK (32-BIT COMPARE) v2: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 15 | 9 | 8 | -// -// -// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 
tile: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 27 | 1 | 1 | 18 | 9 | 8 | -// - -static -skc_uint -skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) -{ - return a & SKC_TTCK_LO_MASK_ID; -} - -static -skc_layer_id -skc_ttck_get_layer(skc_ttck_t const a) -{ - // - // FIXME -- a union with a ulong and a shift down and mask is - // probably faster on some architectures - // - skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); - skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; - - return lo | hi; -} - -static -skc_uint -skc_ttck_hi_get_x(skc_ttck_hi_t const a) -{ - return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); -} - -static -skc_uint -skc_ttck_hi_get_y(skc_ttck_hi_t const a) -{ - return a >> SKC_TTCK_HI_OFFSET_Y; -} - -static -skc_bool -skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b) -{ - skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); - skc_uint const hi = (a.hi ^ b.hi); - - return (lo | hi) == 0; -} - -static -skc_bool -skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) -{ - return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; -} - -static -skc_bool -skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) -{ - return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; -} - -// -// TILE TRACE SUBPIXEL -// -// The subpixels are encoded with either absolute tile coordinates -// (32-bits) or packed in delta-encoded form form. -// -// For 32-bit subpixel packing of a 32x32 tile: -// -// A tile X is encoded as: -// -// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate. -// -// SX : 6 : unsigned subpixel span from min to max x with range -// [0,32]. The original direction is not captured. Would -// be nice to capture dx but not necessary right now but -// could be in the future. 
<--- SPARE VALUES AVAILABLE
//
// A tile Y is encoded as:
//
//   TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.
//
//   DY :  6 : signed subpixel delta y1-y0.  The range of delta is
//             [-32,32] but horizontal lines are not encoded so [1,32]
//             is mapped to [0,31].  The resulting range [-32,31] fits
//             in 6 bits.
//
// TTS:
//
//  0                       31
//  | TX  |  SX  | TY  |  DY  |
//  +-----+------+-----+------+
//  | 10  |  6   | 10  |  6   |
//

//
// Extract the whole-pixel y coordinate from a TTS key by pulling the
// TY field and dropping its subpixel bits.
//
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)
{
  //
  // extract the whole pixel y coordinate
  //
  return SKC_BFE(a,
                 SKC_TTS_BITS_TY   - SKC_SUBPIXEL_RESL_Y_LOG2,
                 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
}

//
// Compute the linear index of a pixel within the column-major tile
// accumulator: (pixel x scaled to column stride) | (pixel y).
//
// The OR works because the preprocessor shift aligns the x component
// onto bits above the TILE_HEIGHT-wide y component.
//
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)
{
  //
  // get the linear array tile index of the pixel
  //
  return (((a & SKC_TTS_MASK_TX_PIXEL)

#if   (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)
           >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)
#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)
           << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2)
#endif

           ) | skc_tts_get_ty_pixel_v(a));
}

#if 0
//
// Original signed-arithmetic variant -- kept for reference.
//
static
skc_ttx_v_s32_t
skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
{
  skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;

  return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));
}
#else
//
// Decode the signed DY field.  The encoder maps [1,32] onto [0,31]
// (dy==0 never occurs), so for non-negative stored values the decode
// must add one back; "- (~a >> 31)" subtracts -1 exactly when the
// sign bit of the stored field is clear.
//
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
{
  SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY;

  return dy - (~a >> 31);
}
#endif

//
// Extract the subpixel fraction of the TX field (low subpixel bits).
//
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)
{
  return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2);
}

//
// Extract the SX (subpixel delta x) field.
//
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)
{
  return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX);
}

//
//
//

//
// Zero the tile's local-memory anti-aliasing accumulator.
//
static
void
skc_tile_aa_zero(__local union skc_subgroup_smem  * SKC_RESTRICT const smem)
{
  //
  // SIMD / CPU
  //
  //      &
  //
  // SIMT / GPU
  //
  // Note that atomic_init() is likely implemented as a simple
  // assignment so there is no identifiable performance difference on
  // current targets.
  //
  // If such an architecture appears in the future then we'll probably
  // still want to implement this zero'ing operation as below but
  // follow with an appropriate fence that occurs before any scatter
  // operations.
  //
  // The baroque expansion below improves performance on Intel GEN by,
  // presumably, achieving the 64-byte per clock SLM write as well as
  // minimizing the overall number of SEND() block initializations and
  // launches.
  //
  // Intel GENx has a documented 64 byte per cycle SLM write limit.
  // So having each lane in an 8 lane subgroup zero-write 8 bytes is
  // probably a safe bet (Later: benchmarking backs this up!).
  //
  // Note there is no reason at this time to unroll this loop.
  //
  for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)
    smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );
}

//
// Note this is going to be vectorizable on most architectures.
//
// The return of the key translation feature might complicate things.
//

//
// Scatter a TTPB (prefix) block into the tile accumulator.  Each TTP
// value is a whole-column altitude, so it is scaled to the full
// subpixel area before accumulation.
//
static
void
skc_scatter_ttpb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
                 skc_block_id_t                                  const pb_id)
{
  skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();

#if   ( SKC_TILE_RATIO == 1 )

  SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];

#elif ( SKC_TILE_RATIO == 2 )

  SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);

#else

#error("tile ratio greater than 2 not supported")

#endif

  //
  // Note there is no need to use an atomic for this operation on the
  // current group of target platforms... but this may change if
  // atomic ops truly go through a different path.
  //
  // As noted above, this direct increment is probably faster and can
  // always be followed by a fence.
  //
  // Furthermore, note that the key sorting orders all ttck keys
  // before ttpk keys.
  //

  //
  // FIXME -- if the SMEM store is wider than bank word count then we
  // might want to odd-even interleave the TTP values if the target
  // device can't handle 64-bit stores
  //

  //
  // skipping per-key translation for now
  //
  smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);
}

//
// Note that skc_scatter_ttsb is *not* vectorizable unless the
// architecture supports a "scatter-add" capability.  All relevant
// GPUs support atomic add on shared/local memory and thus support
// scatter-add.
//

//
// Scatter a TTSB (subpixel line segment) block into the tile
// accumulator: each valid TTS key contributes a left/right trapezoid
// pair at its decoded pixel index.
//
static
void
skc_scatter_ttsb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
                 skc_block_id_t                                  const sb_id)
{
  skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

  SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset];

  //
  // Skipping per-key translation for now
  //

  // Index into tile
  //
  // The tiles are stored in column-major / height-major order
  //
  // The final column is a guard column that is OK to write to but
  // will never be read.  It simplifies the TTSB scatter but could be
  // predicated if SMEM is really at a premium.
  //

  SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);

#if 0
  if (tts_v != SKC_TTS_INVALID)
    printf("(%08X) = %u\n",tts_v,xy_idx);
#endif

  //
  // adjust subpixel range to max y
  //
  // range is stored as [-32,31] and when read [0,31] is mapped to
  // [1,32] because a dy of 0 is not possible.
  //
  // more succinctly: if dy >= 0 then ++dy
  //
  SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v);

  //
  // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?
  //

  // this "min(x0) * 2 + dx" is equivalent to "x0 + x1"
  SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);

  // Calculate left and right coverage contribution trapezoids
  SKC_RENDER_TTS_V_BITFIELD const left  = dy * widths;
  SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;

  //
  // Accumulate altitudes and areas
  //
  // Optimization: if the device supports an CPU/SIMD vector-add or
  // GPU/SIMT scatter-add atomic int2 add operation then placing the
  // ALT and AREA values side-by-side would halve the number of
  // additions.
  //
#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
  //
  // CPU/SIMD
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                 \
  if (tts_v C != SKC_TTS_INVALID) {                             \
    smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left  C;       \
    smem->aN.area[                  xy_idx C] += right C;       \
  }

#else
  //
  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
  if (tts_v C != SKC_TTS_INVALID) {                                     \
    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area +           \
                                          SKC_TILE_HEIGHT + xy_idx C,   \
                                          left C);                      \
    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \
                                          right C);                     \
  }
#endif

  SKC_RENDER_TTSB_EXPAND();
}

//
// Note that 2048.0 can be represented exactly with fp16... fortuitous!
//

#define SKC_RENDER_FILL_MAX_AREA         (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)
#define SKC_RENDER_FILL_MAX_AREA_2       (2u * SKC_RENDER_FILL_MAX_AREA)
#define SKC_RENDER_FILL_EVEN_ODD_MASK    (SKC_RENDER_FILL_MAX_AREA_2 - 1)
#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)

//
//
//

//
// Resolve the accumulated trapezoid areas into [0,1] coverage using
// the non-zero fill rule: running-sum the per-column areas, clamp the
// magnitude to the full subpixel area, and normalize.
//
static
void
skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
                       union skc_tile_cover            * SKC_RESTRICT const cover,
                       union skc_tile_color            * SKC_RESTRICT const color)
{
  SKC_RENDER_ACC_COVER_INT area = 0;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))  // doesn't help on AVX2
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
      SKC_RENDER_TILE_COVER     const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA));

      cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);
    }
}

//
// Resolve the accumulated trapezoid areas into [0,1] coverage using
// the even-odd fill rule: the running sum is folded back on itself
// (triangle wave) via mask-and-reflect before normalization.
//
static
void
skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
                       union skc_tile_cover            * SKC_RESTRICT const cover,
                       union skc_tile_color            * SKC_RESTRICT const color)
{
  SKC_RENDER_ACC_COVER_INT area = 0;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))  // doesn't help on AVX2
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
      SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));

      cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;
    }
}

//
//
//

//
// Fill the tile's working color with a solid rgba color read from the
// styling command stream.  Consumes 2 command words (4 half floats).
//
static
void
skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
                          uint                                 * SKC_RESTRICT const cmd_next,
                          union skc_tile_color                 * SKC_RESTRICT const color)
{
  //
  // rgba = solid fill
  //
  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;

  *cmd_next += 2;

#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )

  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].r = rg.lo;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].g = rg.hi;

  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].b = ba.lo;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].a = ba.hi;

#else

  // vector layout: channels are interleaved as even/odd lanes of the
  // wide color vector -- r=even.even, g=odd.even, b=even.odd, a=odd.odd
  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
  SKC_RENDER_TILE_COLOR      const r  = rg.lo;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);

  SKC_RENDER_TILE_COLOR const g = rg.hi;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);

  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
  SKC_RENDER_TILE_COLOR      const b  = ba.lo;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);

  SKC_RENDER_TILE_COLOR const a = ba.hi;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);

#endif
}

//
// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
//
// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
//
// Lerp in two fma/mad ops:
//
//    t * b + ((-t) * a + a)
//
// Note: OpenCL documents mix() as being implemented as:
//
//    a + (b - a) * t
//
// But this may be a native instruction on some devices.  For example,
// on GEN9 there is an LRP "linear interoplation" function but it
// doesn't appear to support half floats.
//

#if 1
#define SKC_LERP(a,b,t)  mad(t,b,mad(-(t),a,a))
#else
#define SKC_LERP(a,b,t)  mix(a,b,t)
#endif

//
// CPUs have a mock local address space so copying the gradient header
// is probably not useful.  Just read directly from global.
//

#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
#define SKC_RENDER_GRADIENT_SPACE  __local
#else
#define SKC_RENDER_GRADIENT_SPACE  __global
#endif

//
// gradient is non-vertical
//
// removed the vertical (actually, horizontal) special case
//

//
// Evaluate a non-vertical linear gradient for the tile: project each
// pixel onto the gradient vector, apply the repeat/reflect mode,
// resolve each distance to a stop index + lerp fraction, then lerp
// the r/g/b/a channel pairs from the gradient color table.
//
static
void
skc_tile_color_fill_gradient_linear_nonvertical(__local  union skc_subgroup_smem * SKC_RESTRICT const smem,
                                                __global union skc_styling_cmd const * SKC_RESTRICT const commands,
                                                uint                                 * SKC_RESTRICT const cmd_next,
                                                union skc_tile_color                 * SKC_RESTRICT const color,
                                                skc_ttck_hi_t                                        const ttck_hi)
{
  //
  // Where is this tile?
  //
  // Note that the gradient is being sampled from pixel centers.
  //
  SKC_RENDER_GRADIENT_FLOAT const y =
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)  I##.5f P
    (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +
    (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));

  float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);

  //
  // Get starting numerator and denominator
  //
  // Note: if gh[0].dx is exactly 0.0f then this is a vertical
  // gradient and can be handled by a special opcode.
  //
  // Note: the mad() ordering is slightly different than the original
  // CUDA implementation.
  //
  union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) };

  *cmd_next += 4;

  float                     const gv_x_dot = mad(x,gv.dx,gv.p0);
  SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);

  //
  // Where are columns along gradient vector?
  //
  // TODO: Note that the gv_denom isn't multiplied through.
  //
  // Please doublecheck this... but I recall that in certain cases
  // this wipes out some precision and results in minor but noticeable
  // gradient artifacts.
  //
  // All arguments are scalars except gv_numer so a simpler
  // evaluation might save some flops.
  //

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;

  //
  // is gradient non-repeating, repeating or reflecting?
  //
  switch (commands[(*cmd_next)++].u32)
    {
    case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:
      // clamp distances to [0,1]
      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
        color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);
      break;

    case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:
      // keep only the fractional part of the distance
      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
        color->grad[ii].distance -= floor(color->grad[ii].distance);
      break;

    default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING
      //
      // OPTIMIZATION: Can this be done in fewer than ~4 ops?
      //
      // Note: OpenCL "rint()" is round-to-nearest-even integer!
      //
      // Note: the floor() "round to -inf" op is implemented in the
      // GEN op 'FRC' so probably don't use trunc() when floor will
      // suffice.
      //

      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
        {
          SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);
          color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));
        }
    }

  //
  // initialize "stoplerp" for all columns
  //
  uint const slope_count = commands[(*cmd_next)++].u32;
  uint const gd_n_v1     = commands[(*cmd_next)++].u32; // REMOVE ME

  {
    float const slope = commands[(*cmd_next)++].f32;

    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
      color->grad[ii].stoplerp = color->grad[ii].distance * slope;
  }

  //
  // compute stoplerp for remaining stops
  //
  // each pass only adjusts columns whose stoplerp is below this
  // stop's floor; min(0,...) zeroes the contribution otherwise
  //
  for (int jj=1; jj<slope_count; jj++)
    {
      float const floor = (float)jj;
      float const slope = commands[(*cmd_next)++].f32;

      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
        color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);
    }

  //
  // copy gradient colors to local memory
  //
  uint const gd_n = slope_count + 1;

#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
  //
  // copy entire gradient descriptor to local memory
  //
  for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)
    smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;

  __local half const * const SKC_RESTRICT gc = smem->gc + 0;
#else
  //
  // prefetch entire gradient header
  //
  // no noticeable impact on performance
  //
  // prefetch(&commands[*cmd_next].u32,gh_words);
  //
  __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;
#endif

  //
  // adjust cmd_next so that V1 structure is consumed -- FIXME
  //
  *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);

  //
  // lerp between color pair stops
  //
  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      //
      // Finally, we have the gradient stop index and the color stop
      // pair lerp fraction
      //
      // Note that if these are vector values then a gather operation
      // must occur -- there may be platforms (AVX-512?) that can
      // perform an explicit gather on a vector type but it's not
      // really expressible in OpenCL except implicitly with a
      // workgroup of work items.
      //
      // ***********************
      //
      // FIXME -- USE HERB'S SINGLE FMA LERP
      //
      // ***********************
      //
      SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);
      SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));

      // the color table stores channels in planes of gd_n entries:
      // r at [0,gd_n), g at [gd_n,2*gd_n), b, then a
      {
        SKC_RENDER_TILE_COLOR lo, hi;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \
          lo C = cc.lo;                                                 \
          hi C = cc.hi;                                                 \
        }

        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

        color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);
      }

      //
      //
      //
      {
        SKC_RENDER_TILE_COLOR lo, hi;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \
          lo C = cc.lo;                                                 \
          hi C = cc.hi;                                                 \
        }

        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

        color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);
      }

      //
      //
      //
      {
        SKC_RENDER_TILE_COLOR lo, hi;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \
          lo C = cc.lo;                                                 \
          hi C = cc.hi;                                                 \
        }

        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

        color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);
      }

      //
      //
      //
      {
        SKC_RENDER_TILE_COLOR lo, hi;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \
          lo C = cc.lo;                                                 \
          hi C = cc.hi;                                                 \
        }

        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

        color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);
      }
    }
}

//
//
//

//
// "Over" blend: composite the WIP layer onto the accumulator.
//
// NOTE(review): alpha appears to be stored as (1 - alpha) in the
// accumulator -- front-to-back compositing -- confirm against the
// styling/opacity code elsewhere in this file.
//
static
void
skc_tile_blend_over(union skc_tile_color       * SKC_RESTRICT const color_acc,
                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
                    union skc_tile_color const * SKC_RESTRICT const color_wip)
{
  //
  // fralunco = cover.wip * acc.a
  //
  // acc.r = fralunco * wip.r + acc.r
  // acc.g = fralunco * wip.g + acc.g
  // acc.b = fralunco * wip.b + acc.b
  // acc.a = -fralunco * wip.a + acc.a
  //

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a;

      color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
      color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
      color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
      color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
    }
}

//
//
//

//
// "Plus" blend: additive compositing bounded by the remaining alpha.
//
static
void
skc_tile_blend_plus(union skc_tile_color       * SKC_RESTRICT const color_acc,
                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
                    union skc_tile_color const * SKC_RESTRICT const color_wip)
{
  //
  // cover_min = min(cover.wip,a.acc)
  //
  // r.acc = cover_min * r.wip + r.acc
  // g.acc = cover_min * g.wip + g.acc
  // b.acc = cover_min * b.wip + b.acc
  // a.acc = -cover_min * a.wip + a.acc
  //

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a);

      color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
      color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
      color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
      color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
    }
}

//
//
//

//
// "Multiply" blend: modulate the accumulator by the covered WIP color.
//
static
void
skc_tile_blend_multiply(union skc_tile_color       * SKC_RESTRICT const color_acc,
                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
                        union skc_tile_color const * SKC_RESTRICT const color_wip)
{
  //
  // r.acc = (cover.wip * r.wip) * r.acc
  // g.acc = (cover.wip * g.wip) * g.acc
  // b.acc = (cover.wip * b.wip) * b.acc
  // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha)
  //

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r;
      color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g;
      color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b;
      color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a;
    }
}

//
//
//

//
// "Knockout" blend: only the portion of the WIP cover not already
// covered by the accumulator contributes; the cover accumulator is
// updated as well.
//
static
void
skc_tile_blend_knockout(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
                        union skc_tile_color       * SKC_RESTRICT const color_acc,
                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
                        union skc_tile_color const * SKC_RESTRICT const color_wip)
{
  //
  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
  // cover.acc         = cover.acc + cover.wip.contrib
  //
  // r.acc = cover.wip.contrib * r.wip + r.acc
  // g.acc = cover.wip.contrib * g.wip + g.acc
  // b.acc = cover.wip.contrib * b.wip + b.acc
  // a.acc = -cover.wip.contrib * a.wip * a.acc
  //

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii];

      cover_acc->aN.c[ii] += contrib;

      color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
      color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
      color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
      color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
    }
}

//
//
//

//
// Copy the WIP cover into the mask cover.
//
static
void
skc_tile_cover_msk_copy_wip(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
                            union skc_tile_cover const * SKC_RESTRICT const cover_wip)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover_msk->aN.c[ii] = cover_wip->aN.c[ii];

#else

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover_msk->vN.c[ii] = cover_wip->vN.c[ii];

#endif
}

//
//
//

//
// Copy the accumulator cover into the mask cover.
//
static
void
skc_tile_cover_msk_copy_acc(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
                            union skc_tile_cover const * SKC_RESTRICT const cover_acc)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover_msk->aN.c[ii] = cover_acc->aN.c[ii];

#else

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNTN)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover_msk->vN.c[ii] = cover_acc->vN.c[ii];

#endif
}

//
//
//

//
// Accumulate WIP cover into the cover accumulator (front-to-back).
//
static
void
skc_tile_cover_accumulate(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
                          union skc_tile_cover const * SKC_RESTRICT const cover_wip)
{
  //
  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
  // cover.acc         = cover.acc + cover.wip.contrib
  //

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]);
}

//
//
//

//
// Modulate the WIP cover by the mask cover.
//
static
void
skc_tile_cover_wip_mask(union skc_tile_cover       * SKC_RESTRICT const cover_wip,
                        union skc_tile_cover const * SKC_RESTRICT const cover_msk)
{
  //
  // cover.wip *= cover.msk
  //

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover_wip->aN.c[ii] *= cover_msk->aN.c[ii];
}

//
//
//

//
// Zero the WIP cover.
//
static
void
skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 0;

#else
  //
  // GEN9 compiler underperforms on this
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = 0;

#endif
}

//
// Zero the accumulator cover.
//
static
void
skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 0;

#else
  //
  // GEN9 compiler underperforms on this
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = 0;

#endif
}

//
// Zero the mask cover.
//
static
void
skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 0;

#else
  //
  // GEN9 compiler underperforms on this
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = 0;

#endif
}

//
//
//

//
// Set the mask cover to full coverage (all ones).
//
static
void
skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 1;

#else
  //
  // GEN9 compiler underperforms on this
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;

#endif
}

//
//
//

//
// Invert the mask cover: c = 1 - c.
//
static
void
skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 1 - cover->aN.c[ii];

#else

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = 1 - cover->vN.c[ii];

#endif
}

//
//
//

//
// Reset the WIP color to transparent: rgb = 0, alpha channel = 1
// (alpha is stored inverted -- see the blend functions above).
//
static
void
skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)
{
#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      color->aN.rgba[ii].r = 0;
      color->aN.rgba[ii].g = 0;
      color->aN.rgba[ii].b = 0;
      color->aN.rgba[ii].a = 1;
    }

#else
  //
  // DISABLED ON GEN9 -- probably a compiler bug
  //
  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.even = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.even = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.odd = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.odd = 1;
#endif
}

//
// Reset the accumulator color to transparent: rgb = 0, alpha = 1
// (inverted alpha -- same convention as skc_tile_color_wip_zero).
//
static
void
skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)
{
#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      color->aN.rgba[ii].r = 0;
      color->aN.rgba[ii].g = 0;
      color->aN.rgba[ii].b = 0;
      color->aN.rgba[ii].a = 1;
    }

#else
  //
  // DISABLED ON GEN9 -- probably a compiler bug
  //
  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.even = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.even = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.odd = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.odd = 1;
#endif
}

//
//
//

static
bool
skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)
{
  //
  // returns true if tile is opaque
  //
  // various hacks to test for complete tile opacity
  //
  // note that front-to-back currently has alpha at 0.0f -- this can
  // be harmonized to use a traditional alpha if we want to support
  // rendering in either direction
  //
  // hack -- ADD/MAX/OR all alphas together and test for non-zero
  //
  SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)
    t += color->aN.rgba[ii].a;

#if   ( SKC_RENDER_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  return !any(t != ( 0 ));

#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
  //
  // SIMT - scalar per lane
  //
  return !sub_group_any(t != 0);

#else
  //
  // SIMT - vector per lane
- // - return !sub_group_any(any(t != ( 0 ))); - -#endif - - // - // TODO: The alternative vector-per-lane implementation below is - // *not* believed to be performant because the terse vector-wide - // test is just hiding a series of comparisons and is likely worse - // than the blind ADD/MAX/OR'ing of all alphas followed by a single - // test. - // -#if 0 - // - // SIMT - vector per lane - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1))) - for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) - { - if (sub_group_any(any(color->vN.ba[ii].a != ( 0 )))) - return false; - } - - return true; -#endif -} - -// -// -// - -static -void -skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands, - uint * SKC_RESTRICT const cmd_next, - union skc_tile_color * SKC_RESTRICT const color) -{ - // - // acc.r = acc.a * r + acc.r - // acc.g = acc.a * g + acc.g - // acc.b = acc.a * b + acc.b - // - __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; - - *cmd_next += 2; - - SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) - color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) - color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g); - - SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) - color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b); -} - -// -// -// - -// #define SKC_SURFACE_IS_BUFFER -#ifdef SKC_SURFACE_IS_BUFFER - -static -void -skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface, - skc_uint const 
surface_pitch, - union skc_tile_color const * SKC_RESTRICT const color, - skc_ttck_hi_t const ttck_hi) -{ - // - // NEW MAJOR OPTIMIZATION: - // - // Rotating and rasterizing the original world transform by -90 - // degrees and then rendering the scene scene by +90 degrees enables - // all the final surface composite to be perfomed in perfectly - // coalesced wide transactions. - // - // For this reason, linear access to the framebuffer is preferred. - // - // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv - // - // NOTE THIS IS TRANSPOSED BY 90 DEGREES - // - // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE - // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. - // - // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS - // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS - // - // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL - // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER - // - uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE; - uint const x = skc_ttck_hi_get_x(ttck_hi); - uint const y = skc_ttck_hi_get_y(ttck_hi) ; - uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane(); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) - { - SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 ); - - rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255); - rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8; - rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16; - - surface[base + ii * pitch] = rgba; - - // printf("%08v2X\n",rgba); - } -} - -#else - -static -void -skc_surface_composite_u8_rgba(__write_only image2d_t surface, - union skc_tile_color const * SKC_RESTRICT const color, - skc_ttck_hi_t const ttck_hi) -{ - // - // NEW MAJOR OPTIMIZATION: - // - // Rotating and rasterizing the original world 
transform by -90 - // degrees and then rendering the scene scene by +90 degrees enables - // all the final surface composite to be perfomed in perfectly - // coalesced wide transactions. - // - // For this reason, linear access to the framebuffer is preferred. - // - // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv - // - // NOTE THIS IS TRANSPOSED BY 90 DEGREES - // - // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE - // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. - // - // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS - // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS - // - // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL - // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER - // - -#if 1 - int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; - int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) - { -#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_SURFACE_WRITE(surface, \ - (int2)(x,y+I), \ - color->iN.rgba[ii] A); \ - } - -#else - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_SURFACE_COLOR const rgba = \ - (SKC_RENDER_SURFACE_COLOR) \ - (color->aN.rgba[ii].r C, \ - color->aN.rgba[ii].g C, \ - color->aN.rgba[ii].b C, \ - 1.0); \ - SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \ - } - -#endif - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - x += 1; - } -#else - int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); - int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) - { -#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - 
SKC_RENDER_SURFACE_WRITE(surface, \ - (int2)(x+I,y+ii), \ - color->iN.rgba[ii] A); \ - } - -#else - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_SURFACE_COLOR const rgba = \ - (SKC_RENDER_SURFACE_COLOR) \ - (color->aN.rgba[ii].r C, \ - color->aN.rgba[ii].g C, \ - color->aN.rgba[ii].b C, \ - 1.0); \ - SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \ - } - -#endif - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - } - -#endif -} - -#endif - -// -// -// -static -uint const -skc_ttck_lane(uint const ttck_idx) -{ - return ttck_idx & SKC_RENDER_SUBGROUP_MASK; -} - -// -// RENDER KERNEL -// - -__kernel -SKC_RENDER_KERNEL_ATTRIBS -void -skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers, - __global struct skc_group_node const * SKC_RESTRICT const groups, - __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename - - __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys - skc_uint const ttck_count, // rename: key_count - - __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets - skc_uint const tile_count, // rename: offset_count - - __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, -#ifdef SKC_SURFACE_IS_BUFFER - __global void * SKC_RESTRICT const surface, -#else - __write_only image2d_t surface, -#endif -#ifdef SKC_SURFACE_IS_BUFFER - skc_uint const surface_pitch, -#endif - uint4 const tile_clip) // rename: clip -{ - // - // Each subgroup is responsible for a tile. No extra subgroups are - // launched. - // - // FIXME -- might be better implemented as a "grid stride loop" if - // Intel GEN really has a local memory "quantum" of 4KB which means - // we would need to launch 4 subgroups per workgroup. - // - // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB. 
- // - - // - // declare tile cover and color registers - // - // this used to be a neat unified struct but the Intel GEN compiler - // wasn't cooperating and spilling to private memory even though all - // registers were indexed by constants - // - union skc_tile_color color_wip; - union skc_tile_color color_acc; - - union skc_tile_cover cover_wip; - union skc_tile_cover cover_acc; - union skc_tile_cover cover_msk; - - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0) - // as a uniform but the alternative calculation used when there are - // multiple subgroups per workgroup is not cooperating and - // driving spillage elsewhere. - // -#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const ttck_offset_idx = get_group_id(0); -#else - skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // load the starting ttck for this offset and get a bound on the max - // number of keys that might be loaded - // - // these are uniform across all subgroup lanes - // - skc_uint ttck_idx = ttck_offsets[ttck_offset_idx]; - - // - // FIXME -- SIMD/CPU version should probaby load a 256-bit (4-wide) - // vector of ttck keys - // -#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - - skc_ttck_t ttck = ttck_keys[ttck_idx]; - -#else - - uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK; - uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK; - skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)] - -#endif - - // - // set up style group/layer state - // - struct skc_styling_group { - union skc_group_range range; - skc_uint depth; - skc_uint id; - } group; - - group.range.lo = 0; - group.range.hi = SKC_UINT_MAX; - group.depth = 0; - group.id = SKC_UINT_MAX; - - // - // start with clear tile opacity, knockout and flag bits - // - // uint color_acc_opacity = 0; // per lane bit mask -- assumes a 
PIXEL_TILE_HEIGHT <= 32 - // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 - // - skc_uint flags = 0; - - // - // declare and initialize accumulators - // -#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) - __local union skc_subgroup_smem smem[1]; -#else - __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS]; - __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id(); -#endif - -#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - // - // select the initial ttck key - // - skc_ttck_t ttck; -#if 0 - ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN -#else - ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND - ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane); -#endif - -#endif - - // - // save the first key so we know what tile we're in - // - skc_ttck_t ttck0 = ttck; - - // - // evaluate the coarse clip as late as possible - // - skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi); - - if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x)) - return; - - skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi); - - if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y)) - return; - -#if 0 - printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y); -#endif - - // - // load -> scatter -> flush - // - while (true) - { - // if scattering is disabled then just run through ttck keys - bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0; - - // need to clear accumulators before a scatter loop - if (is_scatter_enabled) - { - skc_tile_aa_zero(smem); - } - - do { - // skip scattering? - if (is_scatter_enabled) - { - skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo); - - if (skc_ttck_lo_is_prefix(ttck.lo)) { - skc_scatter_ttpb(ttxb_extent,smem,xb_id); - } else { - skc_scatter_ttsb(ttxb_extent,smem,xb_id); - } - } - - // - // any ttck keys left? 
- // - if (++ttck_idx >= ttck_count) - { - flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; - break; - } - - // - // process next ttck key - // -#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - // - // SIMD -- read next key - // - ttck = ttck_keys[ttck_idx]; -#else - // - // SIMT -- refresh the ttck_s? - // - uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK; - - if (ttck_lane_next == 0) - ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)]; - - // - // broadcast next key to entire subgroup - // -#if 0 - ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN -#else - ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND - ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next); -#endif -#endif - // continue scattering if on same YXL layer - } while (skc_ttck_equal_yxl(ttck0,ttck)); - - // finalize if no longer on same YX tile - if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi)) - { - // otherwise, unwind the tile styling and exit - flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; - } - - // - // given: new layer id from ttxk key - // - // load [layer id]{ group id, depth } - // - // if within current group's layer range - // - // if at same depth - // - // load and execute cover>[mask>]color>blend commands - // - // else if not at same depth then move deeper - // - // for all groups in group trail from cur depth to new depth - // enter group, saving and initializing regs as necessary - // increment depth and update layer range - // load and execute cover>[mask>]color>blend commands - // - // else not within layer range - // - // exit current group, restoring regs as necessary - // decrement depth and update layer range - // - // - skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi - union skc_layer_node const layer_node_new = layers[layer_id_new]; - - // clear flag that controls group/layer traversal - flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE; - - do { - 
bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0; - - // - // is layer a child of the current parent group? - // - uint cmd_next = 0; - - if (!unwind && (layer_node_new.parent == group.id)) - { - // execute this layer's cmds - cmd_next = layer_node_new.cmds; - - // if this is final then configure so groups get unwound, otherwise we're done - flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE); - } - else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi)) - { - // - // is layer in a child group? - // - union skc_group_parents const gp = groups[layer_node_new.parent].parents; - uint const gn = gp.depth - ++group.depth; - - if (gn == 0) - group.id = layer_node_new.parent; - else - group.id = commands[gp.base + gn - 1].parent; - - // update group layer range - group.range = groups[group.id].range; - - // enter current group - cmd_next = groups[group.id].cmds.enter; - } - else // otherwise, exit this group - { - // enter current group - cmd_next = groups[group.id].cmds.leave; - - // decrement group depth - if (--group.depth == 0) - { - flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE; - } - else - { - // get path_base of current group - uint const gnpb = groups[group.id].parents.base; - - // get parent of current group - group.id = commands[gnpb].parent; - - // update group layer range - group.range = groups[group.id].range; - } - } - - // - // execute cmds - // - while (true) - { - union skc_styling_cmd const cmd = commands[cmd_next++]; - - switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE) - { - case SKC_STYLING_OPCODE_NOOP: - break; - - case SKC_STYLING_OPCODE_COVER_NONZERO: - skc_tile_cover_nonzero(smem,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_EVENODD: - skc_tile_cover_evenodd(smem,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACCUMULATE: - skc_tile_cover_accumulate(&cover_acc,&cover_wip); - break; - - case 
SKC_STYLING_OPCODE_COVER_MASK: - skc_tile_cover_wip_mask(&cover_wip,&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_WIP_ZERO: - skc_tile_cover_wip_zero(&cover_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACC_ZERO: - skc_tile_cover_acc_zero(&cover_acc); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_ZERO: - skc_tile_cover_msk_zero(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_ONE: - skc_tile_cover_msk_one(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_INVERT: - skc_tile_cover_msk_invert(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COLOR_FILL_SOLID: - skc_tile_color_fill_solid(commands,&cmd_next,&color_wip); - break; - - case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR: - // - // FIXME -- gradients shouldn't be executing so much - // conditional driven code at runtime since we *know* - // the gradient style on the host can just create a - // new styling command to exploit this. - // - // FIXME -- it might be time to try using the GPU's - // sampler on a linear array of half4 vectors -- it - // might outperform the explicit load/lerp routines. - // - // FIXME -- optimizing for vertical gradients (uhhh, - // they're actually horizontal due to the -90 degree - // view transform) is nice but is it worthwhile to - // have this in the kernel? Easy to add it back... 
- // -#if defined( SKC_ARCH_GEN9 ) - // disable gradients due to exessive spillage -- fix later - cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32); -#else - skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi); -#endif - break; - - case SKC_STYLING_OPCODE_COLOR_WIP_ZERO: - skc_tile_color_wip_zero(&color_wip); - break; - - case SKC_STYLING_OPCODE_COLOR_ACC_ZERO: - skc_tile_color_acc_zero(&color_acc); - break; - - case SKC_STYLING_OPCODE_BLEND_OVER: - skc_tile_blend_over(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_PLUS: - skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_MULTIPLY: - skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_KNOCKOUT: - skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK: - // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK: - // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc); - break; - - case SKC_STYLING_OPCODE_BACKGROUND_OVER: - skc_tile_background_over(commands,&cmd_next,&color_acc); - break; - - case SKC_STYLING_OPCODE_SURFACE_COMPOSITE: -#ifdef SKC_SURFACE_IS_BUFFER - skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi); -#else - skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi); -#endif - break; - - case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY: - if (skc_tile_color_test_opacity(&color_acc)) - flags |= SKC_TILE_FLAGS_SCATTER_SKIP; - break; - - default: - return; // this is an illegal opcode -- trap and die! 
- } - - // - // if sign bit is set then this was final command - // - if (cmd.s32 < 0) - break; - } - - // continue as long as tile flush isn't complete - } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0); - - // return if was the final flush - if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) - return; - - // update wip ttck_hi - ttck0 = ttck; - } -} - -// -// -// +/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "block.h"
+#include "styling_types.h"
+#include "atomic_cl.h"
+#include "device_cl_12.h"
+
+//
+//
+//
+
// lane-index mask for a render subgroup -- only a valid mask when
// SKC_RENDER_SUBGROUP_SIZE is a power of two
#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1)
+
+//
+//
+//
+
+#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16)
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15
+#endif
+
//
// tile state flag bits
//
// These are OR'd into the render loop's per-tile "flags" register to
// drive its flush and scatter control flow.
//

typedef enum skc_tile_flags_e {

  // FLUSH
  SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001, // last flush for this tile -- the kernel returns once it completes
  SKC_TILE_FLAGS_FLUSH_UNWIND   = 0x00000002, // unwind the group trail while performing the final flush
  SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004, // the current group/layer flush traversal is done

  // OPACITY
  SKC_TILE_FLAGS_SCATTER_SKIP   = 0x00000008, // tile tested opaque -- skip scattering of remaining keys

  //
  // Note: testing for opacity and skipping scattering is on its way
  // to becoming a much more programmable option because sometimes we
  // may be compositing/blending from back-to-front and/or be using
  // group blend rules that ignore opacity.
  //
  // The point is that all of these decisions should be encoded in
  // styling commands and, as much as possible, removed from the final
  // group/layer styling traversal render loop.
  //

} skc_tile_flags_e;
+
//
// COVER -- assumes availability of either fp16 or fp32
//
// Per-tile coverage registers. The scalar (aN) and vector (vN) views
// alias the same storage.
//

union skc_tile_cover
{
  // scalar view: one cover value per tile column
  struct {
    SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH];
  } aN;

#ifdef SKC_RENDER_TILE_COVER_VECTOR
  // vector view: the same columns accessed as wider vectors for bulk
  // initialization
  struct {
    SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT];
  } vN;
#endif
};
+
//
// COLOR -- assumes availability of either fp16 or fp32
//
// Per-tile color registers. All member views alias the same storage.
//

union skc_tile_color
{
  // scalar view: one rgba struct per tile column
  union {
    struct {
      SKC_RENDER_TILE_COLOR r;
      SKC_RENDER_TILE_COLOR g;
      SKC_RENDER_TILE_COLOR b;
      SKC_RENDER_TILE_COLOR a;
    } rgba[SKC_TILE_WIDTH];
  } aN;

#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
  // interleaved view: one packed color vector per tile column
  union {
    SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH];
  } iN;
#endif

#ifdef SKC_RENDER_TILE_COLOR_VECTOR
  // wide vector view used for bulk initialization
  union {
    SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT];
  } vN;
#endif

  // gradient scratch view: (r,g) aliases the gradient distance value
  // and (b,a) aliases the stop lerp parameter
  struct {
    union {
      struct {
        SKC_RENDER_TILE_COLOR r;
        SKC_RENDER_TILE_COLOR g;
      };
      SKC_RENDER_GRADIENT_FLOAT distance;
    };
    union {
      struct {
        SKC_RENDER_TILE_COLOR b;
        SKC_RENDER_TILE_COLOR a;
      };
      SKC_RENDER_GRADIENT_FLOAT stoplerp;
    };
  } grad[SKC_TILE_WIDTH];
};
+
//
// SHARED MEMORY STATE
//

// 32-bit words in one tile's coverage area -- includes one guard column
#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)

// bytes of that area owned by each subgroup lane when zeroing with wide stores
#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)
// the per-lane width measured in SKC_RENDER_WIDE_AA elements
#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))
+
+//
+//
+//
+
//
// Shared-memory scratch for one subgroup's tile. All member views
// alias the same storage.
//
union skc_subgroup_smem
{
  //
  // The tiles are stored in column-major / height-major order
  //
  // The final column is a guard column that is OK to write to but
  // will never be read. It simplifies the TTSB scatter but could be
  // predicated if SMEM is really at a premium.
  //
#if ( SKC_RENDER_SUBGROUP_SIZE > 1 )
  // atomic view -- only present on SIMT targets (subgroup size > 1)
  struct {
    SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
  } atomic;
#endif

  // plain scalar view of the same coverage words
  struct {
    int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
  } aN;

  struct { // assumption is that height = subgroup
    SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE];
  } vN;

  struct { // assumption is that height = subgroup
    SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE];
  } wide;

  // NOTE(review): cmds/gc presumably let styling commands and
  // half-float gradient data reuse this scratch -- confirm against
  // the gradient fill path
  union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT];

  half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2];

#if 0
  //
  // SPILL TO GMEM
  //
#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0)
  struct {

#if (SKC_REGS_COLOR_S > 0)
    union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
#endif

#if (SKC_REGS_COVER_S > 0)
    union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
#endif

  } regs;
#endif
  //
  //
  //
#endif
};
+
+//
+//
+//
+
//
// skc_subgroup_lane() -- this lane's index within the subgroup.
// Collapses to the constant 0 when the "subgroup" is a single SIMD
// lane.
//
#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )

#define skc_subgroup_lane() 0

#else

#define skc_subgroup_lane() get_sub_group_local_id()

#endif
+
+//
+//
+//
+
// Each 64-bit TT*K key is manipulated as separate lo/hi 32-bit words
// (see the TTCK layout diagrams below).

typedef skc_uint skc_ttsk_lo_t;
typedef skc_uint skc_ttsk_hi_t;

typedef skc_uint skc_ttpk_lo_t;
typedef skc_uint skc_ttpk_hi_t;

typedef skc_uint skc_ttxk_lo_t;
typedef skc_uint skc_ttxk_hi_t;

typedef skc_uint skc_ttck_lo_t;
typedef skc_uint skc_ttck_hi_t;

typedef skc_uint2 skc_ttck_t; // a full TTCK key as a { lo, hi } pair

typedef skc_int skc_ttxb_t;   // one word of a TTSB/TTPB block extent
+
+//
+// TTCK (32-BIT COMPARE) v1:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 30 | 1 | 1 | 18 | 7 | 7 |
+//
+//
+// TTCK (32-BIT COMPARE) v2:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 30 | 1 | 1 | 15 | 9 | 8 |
+//
+//
+// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 27 | 1 | 1 | 18 | 9 | 8 |
+//
+
+static
+skc_uint
+skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a)
+{
+ return a & SKC_TTCK_LO_MASK_ID;
+}
+
+static
+skc_layer_id
+skc_ttck_get_layer(skc_ttck_t const a)
+{
+ //
+ // FIXME -- a union with a ulong and a shift down and mask is
+ // probably faster on some architectures
+ //
+ skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
+ skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER;
+
+ return lo | hi;
+}
+
//
// extract the tile's X coordinate from the hi word of a TTCK key
//
static
skc_uint
skc_ttck_hi_get_x(skc_ttck_hi_t const a)
{
  return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X);
}
+
+static
+skc_uint
+skc_ttck_hi_get_y(skc_ttck_hi_t const a)
+{
+ return a >> SKC_TTCK_HI_OFFSET_Y;
+}
+
+static
+skc_bool
+skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b)
+{
+ skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
+ skc_uint const hi = (a.hi ^ b.hi);
+
+ return (lo | hi) == 0;
+}
+
+static
+skc_bool
+skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b)
+{
+ return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0;
+}
+
+static
+skc_bool
+skc_ttck_lo_is_prefix(skc_ttck_lo_t const a)
+{
+ return (a & SKC_TTCK_LO_MASK_PREFIX) != 0;
+}
+
+//
+// TILE TRACE SUBPIXEL
+//
+// The subpixels are encoded with either absolute tile coordinates
+// (32-bits) or packed in delta-encoded form.
+//
+// For 32-bit subpixel packing of a 32x32 tile:
+//
+// A tile X is encoded as:
+//
+// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate.
+//
+// SX : 6 : unsigned subpixel span from min to max x with range
+// [0,32]. The original direction is not captured. Would
+// be nice to capture dx but not necessary right now but
+// could be in the future. <--- SPARE VALUES AVAILABLE
+//
+// A tile Y is encoded as:
+//
+// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.
+//
+// DY : 6 : signed subpixel delta y1-y0. The range of delta is
+// [-32,32] but horizontal lines are not encoded so [1,32]
+// is mapped to [0,31]. The resulting range [-32,31] fits
+// in 6 bits.
+//
+// TTS:
+//
+// 0 31
+// | TX | SX | TY | DY |
+// +-----+------+-----+------+
+// | 10 | 6 | 10 | 6 |
+//
+
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)
{
  //
  // extract the whole pixel y coordinate -- the TY bitfield stores a
  // subpixel coordinate, so drop its SKC_SUBPIXEL_RESL_Y_LOG2 low
  // (fractional) bits
  //
  return SKC_BFE(a,
                 SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2,
                 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
}
+
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)
{
  //
  // get the linear array tile index of the pixel
  //
  // the masked whole-pixel TX field is rescaled from subpixel
  // resolution to tile-height units and merged with the whole-pixel
  // TY coordinate (tiles are stored column-major / height-major --
  // see skc_subgroup_smem)
  //
  return (((a & SKC_TTS_MASK_TX_PIXEL)

#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)
           >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)
#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)
           << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2)
#endif

          ) | skc_tts_get_ty_pixel_v(a));
}
+
#if 0
static
skc_ttx_v_s32_t
skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
{
  skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;

  return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));
}
#else
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
{
  //
  // extract the signed subpixel delta dy = y1-y0
  //
  // the encoder maps the positive dy range [1,32] down to [0,31]
  // (horizontal lines are never encoded -- see the TTS layout notes
  // above), so a stored non-negative dy must be bumped back up by
  // one: when a's sign bit is clear, (~a >> 31) evaluates to -1
  // (assuming an arithmetic shift on this signed bitfield type --
  // TODO confirm), making the subtraction an increment
  //
  SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY;

  return dy - (~a >> 31);
}
#endif
+
//
// extract the subpixel fraction of the tile x coordinate
//
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)
{
  return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2);
}
+
//
// extract SX -- the unsigned subpixel x span (see the TTS layout
// diagram above)
//
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)
{
  return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX);
}
+
+//
+//
+//
+
+static
+void
+skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem)
+{
+ //
+ // SIMD / CPU
+ //
+ // &
+ //
+ // SIMT / GPU
+ //
+ // Note that atomic_init() is likely implemented as a simple
+ // assignment so there is no identifiable performance difference on
+ // current targets.
+ //
+ // If such an architecture appears in the future then we'll probably
+ // still want to implement this zero'ing operation as below but
+ // follow with an appropriate fence that occurs before any scatter
+ // operations.
+ //
+ // The baroque expansion below improves performance on Intel GEN by,
+ // presumably, achieving the 64-byte per clock SLM write as well as
+ // minimizing the overall number of SEND() block initializations and
+ // launches.
+ //
+ // Intel GENx has a documented 64 byte per cycle SLM write limit.
+ // So having each lane in an 8 lane subgroup zero-write 8 bytes is
+ // probably a safe bet (Later: benchmarking backs this up!).
+ //
+ // Note there is no reason at this time to unroll this loop.
+ //
+ for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)
+ smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );
+}
+
+//
+// Note this is going to be vectorizable on most architectures.
+//
+// The return of the key translation feature might complicate things.
+//
+
//
// Scatter a TTPB (prefix block): each lane loads its word(s) of the
// block and accumulates them into the tile's first coverage column.
//
static
void
skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
                 __local union skc_subgroup_smem * SKC_RESTRICT const smem,
                 skc_block_id_t const pb_id)
{
  // this lane's word offset into the TTPB block
  skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();

#if ( SKC_TILE_RATIO == 1 )

  SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];

#elif ( SKC_TILE_RATIO == 2 )

  SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);

#else

#error("tile ratio greater than 2 not supported")

#endif

  //
  // Note there is no need to use an atomic for this operation on the
  // current group of target platforms... but this may change if
  // atomic ops truly go through a different path.
  //
  // As noted above, this direct increment is probably faster and can
  // always be followed by a fence.
  //
  // Furthermore, note that the key sorting orders all ttck keys
  // before ttpk keys.
  //

  //
  // FIXME -- if the SMEM store is wider than bank word count then we
  // might want to odd-even interleave the TTP values if the target
  // device can't handle 64-bit stores
  //

  //
  // skipping per-key translation for now
  //
  // NOTE(review): the left shift presumably rescales the prefix
  // coverage into the same fixed-point range as scattered TTS values
  // -- confirm against the TTS accumulation path
  //
  smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);
}
+
+//
+// Note that skc_scatter_ttsb is *not* vectorizable unless the
+// architecture supports a "scatter-add" capability. All relevant
+// GPUs support atomic add on shared/local memory and thus support
+// scatter-add.
+//
+
static
void
skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
                 __local union skc_subgroup_smem * SKC_RESTRICT const smem,
                 skc_block_id_t const sb_id)
{
  //
  // Decode this lane's TTS words from subblock 'sb_id' and
  // scatter-add each word's left/right trapezoid contributions into
  // the tile accumulators.
  //
  skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

  SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset];

  //
  // Skipping per-key translation for now
  //

  // Index into tile
  //
  // The tiles are stored in column-major / height-major order
  //
  // The final column is a guard column that is OK to write to but
  // will never be read. It simplifies the TTSB scatter but could be
  // predicated if SMEM is really at a premium.
  //

  SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);

#if 0
  if (tts_v != SKC_TTS_INVALID)
    printf("(%08X) = %u\n",tts_v,xy_idx);
#endif

  //
  // adjust subpixel range to max y
  //
  // range is stored as [-32,31] and when read [0,31] is mapped to
  // [1,32] because a dy of 0 is not possible.
  //
  // more succinctly: if dy >= 0 then ++dy
  //
  SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v);

  //
  // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?
  //

  // this "min(x0) * 2 + dx" is equivalent to "x0 + x1"
  SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);

  // Calculate left and right coverage contribution trapezoids
  SKC_RENDER_TTS_V_BITFIELD const left  = dy * widths;
  SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;

  //
  // Accumulate altitudes and areas
  //
  // NOTE(review): the [xy_idx] and [SKC_TILE_HEIGHT + xy_idx] slots
  // appear to hold the right and left contributions respectively --
  // confirm against the skc_subgroup_smem union layout.
  //
  // Optimization: if the device supports an CPU/SIMD vector-add or
  // GPU/SIMT scatter-add atomic int2 add operation then placing the
  // ALT and AREA values side-by-side would halve the number of
  // additions.
  //
#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
  //
  // CPU/SIMD
  //
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                  \
  if (tts_v C != SKC_TTS_INVALID) {                              \
    smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left  C;        \
    smem->aN.area[                  xy_idx C] += right C;        \
  }

#else
  //
  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
  //
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
  if (tts_v C != SKC_TTS_INVALID) {                                     \
    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area +           \
                                          SKC_TILE_HEIGHT + xy_idx C,   \
                                          left C);                      \
    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \
                                          right C);                     \
  }
#endif

  // expand over the lane's TTS vector components
  SKC_RENDER_TTSB_EXPAND();
}
+
+//
+// Note that 2048.0 can be represented exactly with fp16... fortuitous!
+//
+
+#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)
+#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA)
+#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1)
+#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)
+
+//
+//
+//
+
static
void
skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
                       union skc_tile_cover * SKC_RESTRICT const cover,
                       union skc_tile_color * SKC_RESTRICT const color)
{
  //
  // Convert the tile's accumulated winding areas to coverage with the
  // non-zero fill rule: coverage is |area| clamped to the maximum
  // fill area and normalized to [0,1].
  //
  // (Unrolling with opencl_unroll_hint(SKC_TILE_WIDTH) didn't help on
  // AVX2.)
  //
  SKC_RENDER_ACC_COVER_INT winding = 0;

  for (uint col=0; col<SKC_TILE_WIDTH; col++)
    {
      winding += smem->vN.area[col][skc_subgroup_lane()];

      SKC_RENDER_ACC_COVER_UINT const mag     = abs(winding);
      SKC_RENDER_ACC_COVER_UINT const clamped = min(mag,SKC_RENDER_FILL_MAX_AREA);

      cover->aN.c[col] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(clamped) * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);
    }
}
+
static
void
skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
                       union skc_tile_cover * SKC_RESTRICT const cover,
                       union skc_tile_color * SKC_RESTRICT const color)
{
  //
  // Convert the tile's accumulated winding areas to coverage with the
  // even-odd fill rule.
  //
  // The running area magnitude is folded into [0, 2*MAX_AREA) and
  // reflected about MAX_AREA, producing a triangle wave:
  //
  //   reflect = | (|area| mod 2*MAX_AREA) - MAX_AREA |
  //   cover   = (MAX_AREA - reflect) / MAX_AREA
  //
  SKC_RENDER_ACC_COVER_INT area = 0;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))  // doesn't help on AVX2
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      area += smem->vN.area[ii][skc_subgroup_lane()];
      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
      SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));

      cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;
    }
}
+
+//
+//
+//
+
static
void
skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
                          uint * SKC_RESTRICT const cmd_next,
                          union skc_tile_color * SKC_RESTRICT const color)
{
  //
  // rgba = solid fill
  //
  // The styling command stream supplies the constant color as two f16
  // pairs -- (r,g) then (b,a) -- and two command words are consumed
  // (*cmd_next advances by 2).
  //
  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;

  *cmd_next += 2;

#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )

  //
  // scalar-per-column layout: broadcast each channel across the tile
  //
  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].r = rg.lo;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].g = rg.hi;

  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].b = ba.lo;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].a = ba.hi;

#else

  //
  // vector layout: the channels are interleaved even/odd within each
  // color vector (r = even.even, g = odd.even, b = even.odd,
  // a = odd.odd)
  //
  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
  SKC_RENDER_TILE_COLOR      const r  = rg.lo;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);

  SKC_RENDER_TILE_COLOR const g = rg.hi;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);

  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
  SKC_RENDER_TILE_COLOR      const b  = ba.lo;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);

  SKC_RENDER_TILE_COLOR const a = ba.hi;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);

#endif
}
+
+//
+// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
+//
+// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
+//
+// Lerp in two fma/mad ops:
+//
+// t * b + ((-t) * a + a)
+//
+// Note: OpenCL documents mix() as being implemented as:
+//
+// a + (b - a) * t
+//
+// But this may be a native instruction on some devices. For example,
+// on GEN9 there is an LRP "linear interoplation" function but it
+// doesn't appear to support half floats.
+//
+
+#if 1
+#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a))
+#else
+#define SKC_LERP(a,b,t) mix(a,b,t)
+#endif
+
+//
+// CPUs have a mock local address space so copying the gradient header
+// is probably not useful. Just read directly from global.
+//
+
+#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
+#define SKC_RENDER_GRADIENT_SPACE __local
+#else
+#define SKC_RENDER_GRADIENT_SPACE __global
+#endif
+
+//
+// gradient is non-vertical
+//
+// removed the vertical (actually, horizontal) special case
+//
+
static
void
skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
                                                __global union skc_styling_cmd const * SKC_RESTRICT const commands,
                                                uint * SKC_RESTRICT const cmd_next,
                                                union skc_tile_color * SKC_RESTRICT const color,
                                                skc_ttck_hi_t const ttck_hi)
{
  //
  // Evaluate a linear gradient for every column of this tile:
  //
  //   1) project each column's pixel center onto the gradient vector
  //   2) apply the wrap mode (clamp / repeat / reflect)
  //   3) map distance to a "stoplerp" -- integer stop index plus
  //      intra-stop fraction -- via the per-stop slope table
  //   4) gather the bracketing color-stop pair per channel and lerp
  //
  // Consumes a variable number of styling command words, advancing
  // *cmd_next past the gradient descriptor.
  //

  //
  // Where is this tile?
  //
  // Note that the gradient is being sampled from pixel centers.
  //
  SKC_RENDER_GRADIENT_FLOAT const y =
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P
    (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +
    (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));

  float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);

  //
  // Get starting numerator and denominator
  //
  // Note: if gh[0].dx is exactly 0.0f then this is a vertical
  // gradient and can be handled by a special opcode.
  //
  // Note: the mad() ordering is slightly different than the original
  // CUDA implementation.
  //
  union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) };

  *cmd_next += 4;

  float const gv_x_dot = mad(x,gv.dx,gv.p0);
  SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);

  //
  // Where are columns along gradient vector?
  //
  // TODO: Note that the gv_denom isn't multiplied through.
  //
  // Please doublecheck this... but I recall that in certain cases
  // this wipes out some precision and results in minor but noticeable
  // gradient artifacts.
  //
  // All arguments are scalars except gv_numer so a simpler
  // evaluation might save some flops.
  //

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;

  //
  // is gradient non-repeating, repeating or reflecting?
  //
  switch (commands[(*cmd_next)++].u32)
    {
    case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:
      // clamp distance to [0,1]
      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
        color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);
      break;

    case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:
      // keep only the fractional part of the distance
      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
        color->grad[ii].distance -= floor(color->grad[ii].distance);
      break;

    default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING
      //
      // OPTIMIZATION: Can this be done in fewer than ~4 ops?
      //
      // Note: OpenCL "rint()" is round-to-nearest-even integer!
      //
      // Note: the floor() "round to -inf" op is implemented in the
      // GEN op 'FRC' so probably don't use trunc() when floor will
      // suffice.
      //

      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
        {
          SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);
          color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));
        }
    }

  //
  // initialize "stoplerp" for all columns
  //
  uint const slope_count = commands[(*cmd_next)++].u32;
  uint const gd_n_v1     = commands[(*cmd_next)++].u32; // REMOVE ME

  {
    float const slope = commands[(*cmd_next)++].f32;

    // first stop: stoplerp is simply distance scaled by the slope
    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
      color->grad[ii].stoplerp = color->grad[ii].distance * slope;
  }

  //
  // compute stoplerp for remaining stops
  //
  // NOTE(review): 'jj' is signed while 'slope_count' is unsigned, so
  // the comparison promotes jj to uint -- harmless for sane counts
  // but worth normalizing to uint.
  //
  for (int jj=1; jj<slope_count; jj++)
    {
      float const floor = (float)jj;
      float const slope = commands[(*cmd_next)++].f32;

      // min(0,...) clamps the delta to <= 0, so only columns that
      // haven't yet reached stop jj (stoplerp < jj) are adjusted
      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
        color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);
    }

  //
  // copy gradient colors to local memory
  //
  uint const gd_n = slope_count + 1;

#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
  //
  // copy entire gradient descriptor to local memory
  //
  for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)
    smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;

  __local half const * const SKC_RESTRICT gc = smem->gc + 0;
#else
  //
  // prefetch entire gradient header
  //
  // no noticeable impact on performance
  //
  // prefetch(&commands[*cmd_next].u32,gh_words);
  //
  __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;
#endif

  //
  // adjust cmd_next so that V1 structure is consumed -- FIXME
  //
  *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);

  //
  // lerp between color pair stops
  //
  // The stop table is laid out channel-major: gc_stop indexes the r
  // pairs, gc_stop + gd_n the g pairs, + gd_n*2 the b pairs and
  // + gd_n*3 the a pairs.
  //
  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      //
      // Finally, we have the gradient stop index and the color stop
      // pair lerp fraction
      //
      // Note that if these are vector values then a gather operation
      // must occur -- there may be platforms (AVX-512?) that can
      // perform an explicit gather on a vector type but it's not
      // really expressible in OpenCL except implicitly with a
      // workgroup of work items.
      //
      // ***********************
      //
      // FIXME -- USE HERB'S SINGLE FMA LERP
      //
      // ***********************
      //
      SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);
      SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));

      // red channel
      {
        SKC_RENDER_TILE_COLOR lo, hi;

#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \
          lo C = cc.lo;                                                 \
          hi C = cc.hi;                                                 \
        }

        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

        color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);
      }

      // green channel
      {
        SKC_RENDER_TILE_COLOR lo, hi;

#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \
          lo C = cc.lo;                                                 \
          hi C = cc.hi;                                                 \
        }

        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

        color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);
      }

      // blue channel
      {
        SKC_RENDER_TILE_COLOR lo, hi;

#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \
          lo C = cc.lo;                                                 \
          hi C = cc.hi;                                                 \
        }

        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

        color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);
      }

      // alpha channel
      {
        SKC_RENDER_TILE_COLOR lo, hi;

#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \
          lo C = cc.lo;                                                 \
          hi C = cc.hi;                                                 \
        }

        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

        color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);
      }
    }
}
+
+//
+//
+//
+
static
void
skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc,
                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
                    union skc_tile_color const * SKC_RESTRICT const color_wip)
{
  //
  // OVER blend of the WIP layer into the accumulator.  The
  // accumulator's alpha channel holds the remaining transparency
  // (1 - alpha), so the blend weight is the WIP coverage scaled by
  // that remaining transparency:
  //
  //   w      = cover.wip * acc.a
  //   acc.r += w * wip.r
  //   acc.g += w * wip.g
  //   acc.b += w * wip.b
  //   acc.a -= w * wip.a
  //
  for (uint col=0; col<SKC_TILE_WIDTH; col++)
    {
      SKC_RENDER_TILE_COVER const w = cover_wip->aN.c[col] * color_acc->aN.rgba[col].a;

      color_acc->aN.rgba[col].r = mad( w,color_wip->aN.rgba[col].r,color_acc->aN.rgba[col].r);
      color_acc->aN.rgba[col].g = mad( w,color_wip->aN.rgba[col].g,color_acc->aN.rgba[col].g);
      color_acc->aN.rgba[col].b = mad( w,color_wip->aN.rgba[col].b,color_acc->aN.rgba[col].b);
      color_acc->aN.rgba[col].a = mad(-w,color_wip->aN.rgba[col].a,color_acc->aN.rgba[col].a);
    }
}
+
+//
+//
+//
+
static
void
skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc,
                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
                    union skc_tile_color const * SKC_RESTRICT const color_wip)
{
  //
  // Additive (PLUS) blend, limited by the accumulator's remaining
  // transparency (acc.a holds 1 - alpha):
  //
  //   w      = min(cover.wip, acc.a)
  //   acc.r += w * wip.r
  //   acc.g += w * wip.g
  //   acc.b += w * wip.b
  //   acc.a -= w * wip.a
  //
  for (uint col=0; col<SKC_TILE_WIDTH; col++)
    {
      SKC_RENDER_TILE_COVER const w = fmin(cover_wip->aN.c[col],color_acc->aN.rgba[col].a);

      color_acc->aN.rgba[col].r = mad( w,color_wip->aN.rgba[col].r,color_acc->aN.rgba[col].r);
      color_acc->aN.rgba[col].g = mad( w,color_wip->aN.rgba[col].g,color_acc->aN.rgba[col].g);
      color_acc->aN.rgba[col].b = mad( w,color_wip->aN.rgba[col].b,color_acc->aN.rgba[col].b);
      color_acc->aN.rgba[col].a = mad(-w,color_wip->aN.rgba[col].a,color_acc->aN.rgba[col].a);
    }
}
+
+//
+//
+//
+
static
void
skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc,
                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
                        union skc_tile_color const * SKC_RESTRICT const color_wip)
{
  //
  // Multiplicative blend of the coverage-weighted WIP layer:
  //
  //   acc.r *= cover.wip * wip.r
  //   acc.g *= cover.wip * wip.g
  //   acc.b *= cover.wip * wip.b
  //   acc.a *= cover.wip * wip.a   <-- acc.a already holds (1.0 - alpha)
  //
  for (uint col=0; col<SKC_TILE_WIDTH; col++)
    {
      SKC_RENDER_TILE_COVER const c = cover_wip->aN.c[col];

      color_acc->aN.rgba[col].r = color_acc->aN.rgba[col].r * (c * color_wip->aN.rgba[col].r);
      color_acc->aN.rgba[col].g = color_acc->aN.rgba[col].g * (c * color_wip->aN.rgba[col].g);
      color_acc->aN.rgba[col].b = color_acc->aN.rgba[col].b * (c * color_wip->aN.rgba[col].b);
      color_acc->aN.rgba[col].a = color_acc->aN.rgba[col].a * (c * color_wip->aN.rgba[col].a);
    }
}
+
+//
+//
+//
+
static
void
skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc,
                        union skc_tile_color * SKC_RESTRICT const color_acc,
                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
                        union skc_tile_color const * SKC_RESTRICT const color_wip)
{
  //
  // Knockout blend: the WIP layer only contributes where the
  // accumulated coverage hasn't already claimed the pixel, and its
  // contribution is added to the coverage accumulator:
  //
  //   contrib    = (1 - cover.acc) * cover.wip
  //   cover.acc += contrib
  //
  //   acc.r     += contrib * wip.r
  //   acc.g     += contrib * wip.g
  //   acc.b     += contrib * wip.b
  //   acc.a     -= contrib * wip.a
  //
  for (uint col=0; col<SKC_TILE_WIDTH; col++)
    {
      SKC_RENDER_TILE_COVER const room    = 1 - cover_acc->aN.c[col];
      SKC_RENDER_TILE_COVER const contrib = room * cover_wip->aN.c[col];

      cover_acc->aN.c[col] = cover_acc->aN.c[col] + contrib;

      color_acc->aN.rgba[col].r = mad( contrib,color_wip->aN.rgba[col].r,color_acc->aN.rgba[col].r);
      color_acc->aN.rgba[col].g = mad( contrib,color_wip->aN.rgba[col].g,color_acc->aN.rgba[col].g);
      color_acc->aN.rgba[col].b = mad( contrib,color_wip->aN.rgba[col].b,color_acc->aN.rgba[col].b);
      color_acc->aN.rgba[col].a = mad(-contrib,color_wip->aN.rgba[col].a,color_acc->aN.rgba[col].a);
    }
}
+
+//
+//
+//
+
static
void
skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk,
                            union skc_tile_cover const * SKC_RESTRICT const cover_wip)
{
  //
  // Copy the WIP coverage into the mask coverage -- scalar per column
  // on scalar and GEN9 targets, otherwise vector-wide.
  //
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover_msk->aN.c[ii] = cover_wip->aN.c[ii];

#else

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover_msk->vN.c[ii] = cover_wip->vN.c[ii];

#endif
}
+
+//
+//
+//
+
static
void
skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk,
                            union skc_tile_cover const * SKC_RESTRICT const cover_acc)
{
  //
  // Copy the accumulated coverage into the mask coverage -- scalar
  // per column on scalar and GEN9 targets, otherwise vector-wide.
  //
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover_msk->aN.c[ii] = cover_acc->aN.c[ii];

#else

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover_msk->vN.c[ii] = cover_acc->vN.c[ii];

#endif
}
+
+//
+//
+//
+
static
void
skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc,
                          union skc_tile_cover const * SKC_RESTRICT const cover_wip)
{
  //
  // Saturating coverage accumulation:
  //
  //   acc += (1 - acc) * wip
  //
  // i.e. the WIP layer only contributes to whatever coverage the
  // accumulator hasn't already claimed.
  //
  for (uint col=0; col<SKC_TILE_WIDTH; col++)
    {
      SKC_RENDER_TILE_COVER const room = 1 - cover_acc->aN.c[col];

      cover_acc->aN.c[col] = mad(room,cover_wip->aN.c[col],cover_acc->aN.c[col]);
    }
}
+
+//
+//
+//
+
static
void
skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip,
                        union skc_tile_cover const * SKC_RESTRICT const cover_msk)
{
  //
  // Scale the WIP coverage by the mask coverage, in place:
  //
  //   cover.wip *= cover.msk
  //
  for (uint col=0; col<SKC_TILE_WIDTH; col++)
    {
      SKC_RENDER_TILE_COVER const m = cover_msk->aN.c[col];

      cover_wip->aN.c[col] = cover_wip->aN.c[col] * m;
    }
}
+
+//
+//
+//
+
static
void
skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
  //
  // Reset the WIP coverage to zero.
  //
  // NOTE(review): unlike skc_tile_cover_msk_zero, the GEN9 fallback
  // clause is commented out here -- confirm that's intentional.
  //
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 0;

#else
  //
  // GEN9 compiler underperforms on this
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = 0;

#endif
}
+
static
void
skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
  //
  // Reset the accumulated coverage to zero.
  //
  // NOTE(review): unlike skc_tile_cover_msk_zero, the GEN9 fallback
  // clause is commented out here -- confirm that's intentional.
  //
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 0;

#else
  //
  // GEN9 compiler underperforms on this
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = 0;

#endif
}
+
static
void
skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
  //
  // Reset the mask coverage to zero.
  //
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 0;

#else
  //
  // GEN9 compiler underperforms on this
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = 0;

#endif
}
+
+//
+//
+//
+
static
void
skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)
{
  //
  // Set the mask coverage to full (1.0) across the tile.
  //
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    cover->aN.c[ii] = 1;

#else
  //
  // GEN9 compiler underperforms on this
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
    cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;

#endif
}
+
+//
+//
+//
+
static
void
skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)
{
  //
  // Invert the mask coverage in place: c = 1 - c
  //
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // scalar per column
  for (uint col=0; col<SKC_TILE_WIDTH; col++)
    cover->aN.c[col] = 1 - cover->aN.c[col];

#else

  // vector-wide
  for (uint vv=0; vv<SKC_RENDER_TILE_COVER_VECTOR_COUNT; vv++)
    cover->vN.c[vv] = 1 - cover->vN.c[vv];

#endif
}
+
+//
+//
+//
+
static
void
skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)
{
  //
  // Reset the WIP color to "nothing drawn": rgb = 0 and a = 1,
  // because the alpha channel stores the remaining transparency
  // (1 - alpha) -- see skc_tile_blend_multiply.
  //
#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      color->aN.rgba[ii].r = 0;
      color->aN.rgba[ii].g = 0;
      color->aN.rgba[ii].b = 0;
      color->aN.rgba[ii].a = 1;
    }

#else
  //
  // DISABLED ON GEN9 -- probably a compiler bug
  //
  // channels are interleaved even/odd: r = even.even, g = odd.even,
  // b = even.odd, a = odd.odd
  //
  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.even = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.even = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.odd = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.odd = 1;
#endif
}
+
static
void
skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)
{
  //
  // Reset the accumulated color to "nothing drawn": rgb = 0 and
  // a = 1, because the alpha channel stores the remaining
  // transparency (1 - alpha) -- see skc_tile_blend_multiply.
  //
#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      color->aN.rgba[ii].r = 0;
      color->aN.rgba[ii].g = 0;
      color->aN.rgba[ii].b = 0;
      color->aN.rgba[ii].a = 1;
    }

#else
  //
  // DISABLED ON GEN9 -- probably a compiler bug
  //
  // channels are interleaved even/odd: r = even.even, g = odd.even,
  // b = even.odd, a = odd.odd
  //
  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.even = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.even = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].even.odd = 0;

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    color->vN.rgba[ii].odd.odd = 1;
#endif
}
+
+//
+//
+//
+
static
bool
skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)
{
  //
  // returns true if tile is opaque -- i.e. every column's alpha
  // channel (which stores remaining transparency, 1 - alpha) is
  // exactly zero
  //
  // various hacks to test for complete tile opacity
  //
  // note that front-to-back currently has alpha at 0.0f -- this can
  // be harmonized to use a traditional alpha if we want to support
  // rendering in either direction
  //
  // hack -- ADD/MAX/OR all alphas together and test for non-zero
  //
  SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
  for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)
    t += color->aN.rgba[ii].a;

#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  return !any(t != ( 0 ));

#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
  //
  // SIMT - scalar per lane
  //
  return !sub_group_any(t != 0);

#else
  //
  // SIMT - vector per lane
  //
  return !sub_group_any(any(t != ( 0 )));

#endif

  //
  // TODO: The alternative vector-per-lane implementation below is
  // *not* believed to be performant because the terse vector-wide
  // test is just hiding a series of comparisons and is likely worse
  // than the blind ADD/MAX/OR'ing of all alphas followed by a single
  // test.
  //
#if 0
  //
  // SIMT - vector per lane
  //

  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1)))
  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
    {
      if (sub_group_any(any(color->vN.ba[ii].a != ( 0 ))))
        return false;
    }

  return true;
#endif
}
+
+//
+//
+//
+
static
void
skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
                         uint * SKC_RESTRICT const cmd_next,
                         union skc_tile_color * SKC_RESTRICT const color)
{
  //
  // Composite a constant background color *under* the accumulated
  // tile color.  acc.a holds the remaining transparency (1 - alpha),
  // so each channel gains acc.a * background:
  //
  // acc.r = acc.a * r + acc.r
  // acc.g = acc.a * g + acc.g
  // acc.b = acc.a * b + acc.b
  //
  // Two f16-pair command words are consumed; the background's alpha
  // (ba.hi) is unused and acc.a is left unchanged.
  //
  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;

  *cmd_next += 2;

  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r);

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g);

  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b);
}
+
+//
+//
+//
+
+// #define SKC_SURFACE_IS_BUFFER
+#ifdef SKC_SURFACE_IS_BUFFER
+
static
void
skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface,
                              skc_uint const surface_pitch,
                              union skc_tile_color const * SKC_RESTRICT const color,
                              skc_ttck_hi_t const ttck_hi)
{
  //
  // Write the finished tile color to a linear (buffer) surface as
  // packed 8-bit RGBA.
  //
  // NEW MAJOR OPTIMIZATION:
  //
  // Rotating and rasterizing the original world transform by -90
  // degrees and then rendering the scene scene by +90 degrees enables
  // all the final surface composite to be perfomed in perfectly
  // coalesced wide transactions.
  //
  // For this reason, linear access to the framebuffer is preferred.
  //
  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
  //
  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
  //
  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
  //
  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
  //
  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
  //

  // pitch in scanline-vector units, not pixels
  uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE;
  uint const x     = skc_ttck_hi_get_x(ttck_hi);
  uint const y     = skc_ttck_hi_get_y(ttck_hi) ;
  uint const base  = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane();

  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
    {
      // pack 8-bit channels: r in bits 0-7, g in 8-15, b in 16-23,
      // and alpha forced fully opaque (0xFF in bits 24-31)
      SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 );

      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255);
      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8;
      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16;

      surface[base + ii * pitch] = rgba;

      // printf("%08v2X\n",rgba);
    }
}
+
+#else
+
+static
+void
+skc_surface_composite_u8_rgba(__write_only image2d_t                          surface,
+                              union skc_tile_color const * SKC_RESTRICT const color,
+                              skc_ttck_hi_t                             const ttck_hi)
+{
+  //
+  // Composite one tile of accumulated color onto an *image2d_t*
+  // surface via SKC_RENDER_SURFACE_WRITE. The tile position is
+  // decoded from the hi word of the ttck key.
+  //
+  // NEW MAJOR OPTIMIZATION:
+  //
+  // Rotating and rasterizing the original world transform by -90
+  // degrees and then rendering the scene by +90 degrees enables
+  // all the final surface composite to be performed in perfectly
+  // coalesced wide transactions.
+  //
+  // For this reason, linear access to the framebuffer is preferred.
+  //
+  // vvvvvvvvvvvv  OLD NOTE BELOW  vvvvvvvvvvvvv
+  //
+  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
+  //
+  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
+  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
+  //
+  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
+  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
+  //
+  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
+  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
+  //
+
+  // active path: x advances per column, lanes fan out along y via
+  // the SKC_RENDER_SCANLINE_VECTOR_EXPAND/SKC_EXPAND_X macro pair
+#if 1
+  int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
+  int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      // interleaved layout: write the vector element selected by the
+      // expansion accessor 'A' directly
+#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                       \
+    SKC_RENDER_SURFACE_WRITE(surface,                   \
+                             (int2)(x,y+I),             \
+                             color->iN.rgba[ii] A);     \
+  }
+
+#else
+
+  // planar layout: assemble an rgba vector from the per-channel
+  // accumulators (component selector 'C'), alpha forced to 1.0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                               \
+    SKC_RENDER_SURFACE_COLOR const rgba =                       \
+      (SKC_RENDER_SURFACE_COLOR)                                \
+      (color->aN.rgba[ii].r C,                                  \
+       color->aN.rgba[ii].g C,                                  \
+       color->aN.rgba[ii].b C,                                  \
+       1.0);                                                    \
+    SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba);       \
+  }
+
+#endif
+
+      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+      x += 1;
+    }
+#else
+  // disabled alternate path: un-transposed addressing (x from the
+  // key's y, y from the key's x) -- kept for reference
+  int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
+  int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                       \
+    SKC_RENDER_SURFACE_WRITE(surface,                   \
+                             (int2)(x+I,y+ii),          \
+                             color->iN.rgba[ii] A);     \
+  }
+
+#else
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                               \
+    SKC_RENDER_SURFACE_COLOR const rgba =                       \
+      (SKC_RENDER_SURFACE_COLOR)                                \
+      (color->aN.rgba[ii].r C,                                  \
+       color->aN.rgba[ii].g C,                                  \
+       color->aN.rgba[ii].b C,                                  \
+       1.0);                                                    \
+    SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba);    \
+  }
+
+#endif
+
+      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+    }
+
+#endif
+}
+
+#endif
+
+//
+//
+//
+static
+uint
+skc_ttck_lane(uint const ttck_idx)
+{
+  // Subgroup lane that owns ttck key 'ttck_idx'.
+  //
+  // NOTE: the 'const' qualifier on the return type was dropped -- a
+  // const-qualified scalar return value is meaningless and draws
+  // -Wignored-qualifiers warnings.
+  return ttck_idx & SKC_RENDER_SUBGROUP_MASK;
+}
+
+//
+// RENDER KERNEL
+//
+
+__kernel
+SKC_RENDER_KERNEL_ATTRIBS
+void
+skc_kernel_render(__global union  skc_layer_node const * SKC_RESTRICT const layers,
+                  __global struct skc_group_node const * SKC_RESTRICT const groups,
+                  __global union  skc_styling_cmd const * SKC_RESTRICT const commands,     // FIXME -- rename
+
+                  __global skc_ttck_t const * SKC_RESTRICT const ttck_keys,    // rename: keys
+                  skc_uint                                 const ttck_count,   // rename: key_count
+
+                  __global uint const * SKC_RESTRICT const ttck_offsets,       // rename: offsets
+                  skc_uint                           const tile_count,         // rename: offset_count
+
+                  __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
+#ifdef SKC_SURFACE_IS_BUFFER
+                  __global void * SKC_RESTRICT const surface,
+#else
+                  __write_only image2d_t             surface,
+#endif
+#ifdef SKC_SURFACE_IS_BUFFER
+                  skc_uint const surface_pitch,
+#endif
+                  uint4 const tile_clip)                                       // rename: clip
+{
+  //
+  // Each subgroup is responsible for a tile.  No extra subgroups are
+  // launched.
+  //
+  // FIXME -- might be better implemented as a "grid stride loop" if
+  // Intel GEN really has a local memory "quantum" of 4KB which means
+  // we would need to launch 4 subgroups per workgroup.
+  //
+  // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB.
+  //
+
+  //
+  // declare tile cover and color registers
+  //
+  // this used to be a neat unified struct but the Intel GEN compiler
+  // wasn't cooperating and spilling to private memory even though all
+  // registers were indexed by constants
+  //
+  union skc_tile_color  color_wip;
+  union skc_tile_color  color_acc;
+
+  union skc_tile_cover  cover_wip;
+  union skc_tile_cover  cover_acc;
+  union skc_tile_cover  cover_msk;
+
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0)
+  // as a uniform but the alternative calculation used when there are
+  // multiple subgroups per workgroup is not cooperating and
+  // driving spillage elsewhere.
+  //
+#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
+  skc_uint const ttck_offset_idx = get_group_id(0);
+#else
+  skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // load the starting ttck for this offset and get a bound on the max
+  // number of keys that might be loaded
+  //
+  // these are uniform across all subgroup lanes
+  //
+  skc_uint ttck_idx = ttck_offsets[ttck_offset_idx];
+
+  //
+  // FIXME -- SIMD/CPU version should probably load a 256-bit (4-wide)
+  // vector of ttck keys
+  //
+#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+
+  skc_ttck_t ttck = ttck_keys[ttck_idx];
+
+#else
+
+  // coalesced path: each lane loads one key; the subgroup then
+  // broadcasts from the lane holding the current key
+  //
+  // NOTE: a missing semicolon on the ttck_s declaration was fixed --
+  // it was a hard compile error whenever this branch was enabled
+  uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK;
+  uint const ttck_lane = ttck_idx &  SKC_RENDER_SUBGROUP_MASK;
+  skc_ttck_t ttck_s    = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)];
+
+#endif
+
+  //
+  // set up style group/layer state
+  //
+  struct skc_styling_group {
+    union skc_group_range range;
+    skc_uint              depth;
+    skc_uint              id;
+  } group;
+
+  group.range.lo = 0;
+  group.range.hi = SKC_UINT_MAX;
+  group.depth    = 0;
+  group.id       = SKC_UINT_MAX;
+
+  //
+  // start with clear tile opacity, knockout and flag bits
+  //
+  // uint color_acc_opacity  = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
+  // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
+  //
+  skc_uint flags = 0;
+
+  //
+  // declare and initialize accumulators
+  //
+#if   ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
+  __local union skc_subgroup_smem                     smem[1];
+#else
+  __local union skc_subgroup_smem                     smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS];
+  __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id();
+#endif
+
+#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+  //
+  // select the initial ttck key
+  //
+  skc_ttck_t ttck;
+#if 0
+  ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN
+#else
+  ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND
+  ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane);
+#endif
+
+#endif
+
+  //
+  // save the first key so we know what tile we're in
+  //
+  skc_ttck_t ttck0 = ttck;
+
+  //
+  // evaluate the coarse clip as late as possible
+  //
+  skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi);
+
+  if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x))
+    return;
+
+  skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi);
+
+  if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y))
+    return;
+
+#if 0
+  printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y);
+#endif
+
+  //
+  // load -> scatter -> flush
+  //
+  while (true)
+    {
+      // if scattering is disabled then just run through ttck keys
+      bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0;
+
+      // need to clear accumulators before a scatter loop
+      if (is_scatter_enabled)
+        {
+          skc_tile_aa_zero(smem);
+        }
+
+      do {
+        // skip scattering?
+        if (is_scatter_enabled)
+          {
+            skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo);
+
+            if (skc_ttck_lo_is_prefix(ttck.lo)) {
+              skc_scatter_ttpb(ttxb_extent,smem,xb_id);
+            } else {
+              skc_scatter_ttsb(ttxb_extent,smem,xb_id);
+            }
+          }
+
+        //
+        // any ttck keys left?
+        //
+        if (++ttck_idx >= ttck_count)
+          {
+            flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
+            break;
+          }
+
+        //
+        // process next ttck key
+        //
+#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+        //
+        // SIMD -- read next key
+        //
+        ttck = ttck_keys[ttck_idx];
+#else
+        //
+        // SIMT -- refresh the ttck_s?
+        //
+        uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
+
+        if (ttck_lane_next == 0)
+          ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)];
+
+        //
+        // broadcast next key to entire subgroup
+        //
+#if 0
+        ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN
+#else
+        ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND
+        ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next);
+#endif
+#endif
+        // continue scattering if on same YXL layer
+      } while (skc_ttck_equal_yxl(ttck0,ttck));
+
+      // finalize if no longer on same YX tile
+      if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi))
+        {
+          // otherwise, unwind the tile styling and exit
+          flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
+        }
+
+      //
+      // given: new layer id from ttxk key
+      //
+      // load [layer id]{ group id, depth }
+      //
+      // if within current group's layer range
+      //
+      //   if at same depth
+      //
+      //     load and execute cover>[mask>]color>blend commands
+      //
+      //   else if not at same depth then move deeper
+      //
+      //     for all groups in group trail from cur depth to new depth
+      //       enter group, saving and initializing regs as necessary
+      //       increment depth and update layer range
+      //       load and execute cover>[mask>]color>blend commands
+      //
+      // else not within layer range
+      //
+      //   exit current group, restoring regs as necessary
+      //   decrement depth and update layer range
+      //
+      //
+      skc_layer_id         const layer_id_new   = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi
+      union skc_layer_node const layer_node_new = layers[layer_id_new];
+
+      // clear flag that controls group/layer traversal
+      flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE;
+
+      do {
+        bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0;
+
+        //
+        // is layer a child of the current parent group?
+        //
+        uint cmd_next = 0;
+
+        if (!unwind && (layer_node_new.parent == group.id))
+          {
+            // execute this layer's cmds
+            cmd_next = layer_node_new.cmds;
+
+            // if this is final then configure so groups get unwound, otherwise we're done
+            flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE);
+          }
+        else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi))
+          {
+            //
+            // is layer in a child group?
+            //
+            union skc_group_parents const gp = groups[layer_node_new.parent].parents;
+            uint                    const gn = gp.depth - ++group.depth;
+
+            if (gn == 0)
+              group.id = layer_node_new.parent;
+            else
+              group.id = commands[gp.base + gn - 1].parent;
+
+            // update group layer range
+            group.range = groups[group.id].range;
+
+            // enter current group
+            cmd_next = groups[group.id].cmds.enter;
+          }
+        else // otherwise, exit this group
+          {
+            // enter current group
+            cmd_next = groups[group.id].cmds.leave;
+
+            // decrement group depth
+            if (--group.depth == 0)
+              {
+                flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE;
+              }
+            else
+              {
+                // get path_base of current group
+                uint const gnpb = groups[group.id].parents.base;
+
+                // get parent of current group
+                group.id = commands[gnpb].parent;
+
+                // update group layer range
+                group.range = groups[group.id].range;
+              }
+          }
+
+        //
+        // execute cmds
+        //
+        while (true)
+          {
+            union skc_styling_cmd const cmd = commands[cmd_next++];
+
+            switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE)
+              {
+              case SKC_STYLING_OPCODE_NOOP:
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_NONZERO:
+                skc_tile_cover_nonzero(smem,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_EVENODD:
+                skc_tile_cover_evenodd(smem,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACCUMULATE:
+                skc_tile_cover_accumulate(&cover_acc,&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK:
+                skc_tile_cover_wip_mask(&cover_wip,&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_WIP_ZERO:
+                skc_tile_cover_wip_zero(&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACC_ZERO:
+                skc_tile_cover_acc_zero(&cover_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_ZERO:
+                skc_tile_cover_msk_zero(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_ONE:
+                skc_tile_cover_msk_one(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_INVERT:
+                skc_tile_cover_msk_invert(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_FILL_SOLID:
+                skc_tile_color_fill_solid(commands,&cmd_next,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
+                //
+                // FIXME -- gradients shouldn't be executing so much
+                // conditional driven code at runtime since we *know*
+                // the gradient style on the host can just create a
+                // new styling command to exploit this.
+                //
+                // FIXME -- it might be time to try using the GPU's
+                // sampler on a linear array of half4 vectors -- it
+                // might outperform the explicit load/lerp routines.
+                //
+                // FIXME -- optimizing for vertical gradients (uhhh,
+                // they're actually horizontal due to the -90 degree
+                // view transform) is nice but is it worthwhile to
+                // have this in the kernel?  Easy to add it back...
+                //
+#if defined( SKC_ARCH_GEN9 )
+                // disable gradients due to excessive spillage -- fix later
+                cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32);
+#else
+                skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);
+#endif
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_WIP_ZERO:
+                skc_tile_color_wip_zero(&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_ACC_ZERO:
+                skc_tile_color_acc_zero(&color_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_OVER:
+                skc_tile_blend_over(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_PLUS:
+                skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_MULTIPLY:
+                skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_KNOCKOUT:
+                skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
+                // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
+                // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_BACKGROUND_OVER:
+                skc_tile_background_over(commands,&cmd_next,&color_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_SURFACE_COMPOSITE:
+#ifdef SKC_SURFACE_IS_BUFFER
+                skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi);
+#else
+                skc_surface_composite_u8_rgba(surface,              &color_acc,ttck0.hi);
+#endif
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
+                if (skc_tile_color_test_opacity(&color_acc))
+                  flags |= SKC_TILE_FLAGS_SCATTER_SKIP;
+                break;
+
+              default:
+                return; // this is an illegal opcode -- trap and die!
+              }
+
+            //
+            // if sign bit is set then this was final command
+            //
+            if (cmd.s32 < 0)
+              break;
+          }
+
+        // continue as long as tile flush isn't complete
+      } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0);
+
+      // return if was the final flush
+      if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
+        return;
+
+      // update wip ttck_hi
+      ttck0 = ttck;
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/segment_ttck.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl index 6ac068bee6..378d51d8d7 100644 --- a/src/compute/skc/segment_ttck.cl +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl @@ -1,131 +1,130 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE -// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS -// KERNEL WILL NEED TO BE UPDATED -// - -#include <hs/cl/gen9/hs_cl_macros.h> - -#include "atomic_cl.h" -#include "tile.h" - -// -// -// - -#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) -#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) - -// -// -// - -#define SKC_YX_NEQ(row,prev) \ - (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0) - -// -// -// - -__kernel -__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) -void -skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, - __global uint * SKC_RESTRICT const indices, - __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) -{ - uint const global_id = get_global_id(0); - uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; - uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); - uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; - - // - // LOAD ALL THE ROWS - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; - - HS_SLAB_ROWS(); - - // - // LOAD LAST REGISTER FROM COLUMN TO LEFT - // - uint diffs = 0; - uint2 r0 = r1; - - if (gmem_base > 0) { - // if this is the first key in any slab but the first then it - // broadcast loads the last key in previous slab - r0.hi = as_uint2(vout[gmem_base - 1]).hi; - } else if (get_sub_group_local_id() == 0) { - // if this 
is the first lane in the first slab - diffs = 1; - } - - // now shuffle in the last key from the column to the left - r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); - - // - // FIND ALL DIFFERENCES IN SLAB - // - uint valid = 0; - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - valid |= ((r##row != SKC_ULONG_MAX) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - diffs |= (SKC_YX_NEQ(row,prev) << prev); - - HS_SLAB_ROWS(); - - // - // SUM UP THE DIFFERENCES - // - uint const valid_diffs = valid & diffs; - uint const count = popcount(valid_diffs); - uint const inclusive = sub_group_scan_inclusive_add(count); - uint const exclusive = inclusive - count; - - // - // RESERVE SPACE IN THE INDICES ARRAY - // - uint next = 0; - - if (get_sub_group_local_id() == HS_LANES_PER_WARP-1) - next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset - - // distribute base across subgroup - next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1); - - // - // STORE THE INDICES - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (valid_diffs & (1 << prev)) \ - indices[next++] = lane_idx + prev; - - HS_SLAB_ROWS(); - - // - // TRANSPOSE THE SLAB AND STORE IT - // - HS_TRANSPOSE_SLAB(); -} - -// -// -// +/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
+// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
+// KERNEL WILL NEED TO BE UPDATED
+//
+
+#include "tile.h"
+#include "atomic_cl.h"
+#include "device_cl_12.h"
+
+//
+//
+//
+
+#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
+#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
+
+//
+//
+//
+
+#define SKC_YX_NEQ(row,prev) \
+ (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
+
+//
+//
+//
+
+__kernel
+__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
+void
+skc_kernel_segment_ttck(__global HS_KEY_TYPE              * SKC_RESTRICT const vout,
+                        __global uint                     * SKC_RESTRICT const indices,
+                        __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
+{
+  // Post-sort segmentation pass over the ttck key extent: each
+  // subgroup loads one slab of sorted keys, finds every position
+  // where the YX (tile) bits change, appends those positions to the
+  // 'indices' array (reserving space via an atomic add on
+  // atomics[1]), then transposes the slab back to memory.
+  uint const global_id = get_global_id(0);
+  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
+  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);
+  uint const lane_idx  = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
+
+  //
+  // LOAD ALL THE ROWS
+  //
+  // r1..rN registers are declared by the HS_SLAB_ROWS() expansion
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                                           \
+  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
+
+  HS_SLAB_ROWS();
+
+  //
+  // LOAD LAST REGISTER FROM COLUMN TO LEFT
+  //
+  uint  diffs = 0;
+  uint2 r0    = r1;
+
+  if (gmem_base > 0) {
+    // if this is the first key in any slab but the first then it
+    // broadcast loads the last key in previous slab
+    r0.hi = as_uint2(vout[gmem_base - 1]).hi;
+  } else if (get_sub_group_local_id() == 0) {
+    // if this is the first lane in the first slab
+    diffs = 1;
+  }
+
+  // now shuffle in the last key from the column to the left
+  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
+
+  //
+  // FIND ALL DIFFERENCES IN SLAB
+  //
+  // 'valid' marks keys that are not the SKC_ULONG_MAX padding value
+  uint valid = 0;
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  valid |= ((r##row != SKC_ULONG_MAX) << prev);
+
+  HS_SLAB_ROWS();
+
+  // 'diffs' marks keys whose YX tile bits differ from the previous key
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  diffs |= (SKC_YX_NEQ(row,prev) << prev);
+
+  HS_SLAB_ROWS();
+
+  //
+  // SUM UP THE DIFFERENCES
+  //
+  // subgroup prefix sum assigns each lane its output slot range
+  uint const valid_diffs = valid & diffs;
+  uint const count       = popcount(valid_diffs);
+  uint const inclusive   = sub_group_scan_inclusive_add(count);
+  uint const exclusive   = inclusive - count;
+
+  //
+  // RESERVE SPACE IN THE INDICES ARRAY
+  //
+  uint next = 0;
+
+  if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
+    next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
+
+  // distribute base across subgroup
+  next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
+
+  //
+  // STORE THE INDICES
+  //
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  if (valid_diffs & (1 << prev))                \
+    indices[next++] = lane_idx + prev;
+
+  HS_SLAB_ROWS();
+
+  //
+  // TRANSPOSE THE SLAB AND STORE IT
+  //
+  HS_TRANSPOSE_SLAB();
+}
+
+//
+//
+//
diff --git a/src/compute/skc/segment_ttrk.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl index 28a9557ad7..e9accde307 100644 --- a/src/compute/skc/segment_ttrk.cl +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl @@ -1,396 +1,394 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE -// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS -// KERNEL WILL NEED TO BE UPDATED -// - -#include <hs/cl/gen9/hs_cl_macros.h> - -#include "tile.h" -#include "raster_builder_cl_12.h" // need meta_in structure -#include "device_cl_12_gen9.h" - -// -// -// - -#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) -#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) - -// -// THE BEST TYPE TO ZERO SMEM -// - -#define SKC_ZERO_TYPE ulong -#define SKC_ZERO_WORDS 2 - -// -// THE ORDER OF COMPONENTS IS: -// -// 0: blocks -// 1: offset -// 2: pk -// 3: rk -// - -#if (HS_KEYS_PER_SLAB < 256) - -#define SKC_META_TYPE uint -#define SKC_META_WORDS 1 - -#define SKC_COMPONENT_TYPE uchar - -#else - -#define SKC_META_TYPE uint2 -#define SKC_META_WORDS 2 - -#define SKC_COMPONENT_TYPE ushort - -#endif - -// -// -// - -#if ( SKC_TTRK_HI_BITS_COHORT <= 8) -#define SKC_COHORT_TYPE uchar -#else -#define SKC_COHORT_TYPE ushort -#endif - -// -// -// - -#define SKC_COHORT_ID(row) \ - as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT - -// -// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED -// - -#define SKC_IS_BLOCK(row) \ - ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - -#define SKC_YX(row,prev) \ - (as_uint2(r##row).hi ^ as_uint2(r##prev).hi) - -#define SKC_IS_PK(row,prev) \ - ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X) - -// -// COHORT SIZE IS ALWAYS A POWER-OF-TWO -// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO -// -// COHORT SIZE >= SUBGROUP 
SIZE -// - -#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT) - -#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS) -#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE)) -#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2)) - -#define SKC_META_COMPONENTS 4 -#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE)) - -// -// -// - -__kernel -__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) -void -skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, - __global uint * SKC_RESTRICT const metas) -{ - __local union - { - SKC_META_TYPE volatile m[SKC_COHORT_SIZE]; - SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT]; - SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; - uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); - uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; - - // - // LOAD ALL THE ROWS - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; - - HS_SLAB_ROWS(); - - // - // LOAD LAST REGISTER FROM COLUMN TO LEFT - // - uint diffs = 0; - uint2 r0 = 0; - - if (gmem_base > 0) { - // if this is the first key in any slab but the first then it - // broadcast loads the last key in previous slab - r0.hi = as_uint2(vout[gmem_base - 1]).hi; - } else { - // otherwise broadcast the first key in the first slab - r0.hi = sub_group_broadcast(as_uint2(r1).hi,0); - // and mark it as an implicit diff - if (get_sub_group_local_id() == 0) - diffs = 1; - } - - // now shuffle in the last key from the column to the left - r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); - - // shift away y/x - SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT; - - // - // 
EXTRACT ALL COHORT IDS EARLY... - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row); - - HS_SLAB_ROWS(); - - // - // DEBUG - // -#if 0 - if (gmem_base == HS_KEYS_PER_SLAB * 7) - { - if (get_sub_group_local_id() == 0) - printf("\n%llX ",as_ulong(r0)); - else - printf("%llX ",as_ulong(r0)); -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (get_sub_group_local_id() == 0) \ - printf("\n%llX ",r##row); \ - else \ - printf("%llX ",r##row); - - HS_SLAB_ROWS(); - } -#endif - - // - // CAPTURE ALL CONDITIONS WE CARE ABOUT - // - // Diffs must be captured before cohorts - // - uint valid = 0; - uint blocks = 0; - uint pks = 0; - SKC_COHORT_TYPE c_max = 0; - - // - // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN - // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE - // -#if 0 - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - diffs |= ((c##row != c##prev) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - blocks |= (SKC_IS_BLOCK(row) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - pks |= SKC_IS_PK(row,prev) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - valid |= ((r##row != SKC_ULONG_MAX) << prev); - - HS_SLAB_ROWS(); - -#else - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (c##row != c##prev) \ - diffs |= 1<<prev; - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (SKC_IS_BLOCK(row)) \ - blocks |= 1<<prev; - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (SKC_IS_PK(row,prev)) \ - pks |= 1<<prev; - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (r##row != SKC_ULONG_MAX) { \ - valid |= 1<<prev; \ - c_max = max(c_max,c##row); \ - } - - HS_SLAB_ROWS(); - -#endif - - // - // TRANSPOSE THE SLAB AND STORE IT - // - HS_TRANSPOSE_SLAB(); - - // the min cohort is the first key in the slab - 
uint const c_min = sub_group_broadcast(c1,0); - - // the max cohort is the max across all lanes - c_max = sub_group_reduce_max(c_max); - -#if 0 // REMOVE ME LATER - if (get_sub_group_local_id() == 0) - printf("%3u : ( %3u , %3u )\n", - get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max); -#endif - - // - // ZERO SMEM - // - // zero only the meta info for the cohort ids found in this slab - // -#if (SKC_ZERO_WORDS >= SKC_META_WORDS) - uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id(); - uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO; - - for (; zz<=zz_max; zz+=HS_LANES_PER_WARP) - shared.z[zz] = 0; -#else - // ERROR -- it's highly unlikely that the zero type is smaller than - // the meta type -#error("Unsupported right now...") -#endif - - // - // ACCUMULATE AND STORE META INFO - // - uint const valid_blocks = valid & blocks; - uint const valid_pks = valid & pks & ~diffs; - SKC_META_TYPE meta = ( 0 ); - -#define SKC_META_LOCAL_ADD(meta) \ - atomic_add(shared.m+HS_REG_LAST(c),meta); - -#define SKC_META_LOCAL_STORE(meta,prev) \ - shared.m[c##prev] = meta; - - // note this is purposefully off by +1 -#define SKC_META_RESET(meta,curr) \ - meta = ((gmem_off + curr) << 8); - -#if 0 - - // FIXME -- this can be tweaked to shift directly -#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ - meta += ((((blocks >> prev) & 1) ) | \ - (((pks >> prev) & 1) << 16) | \ - (((rks >> prev) & 1) << 24)); - -#else - -#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ - if (blocks & (1<<prev)) \ - meta += 1; \ - if (pks & (1<<prev)) \ - meta += 1<<16; \ - if (rks & (1<<prev)) \ - meta += 1<<24; - -#endif - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (diffs & (1<<prev)) { \ - SKC_META_LOCAL_STORE(meta,prev); \ - SKC_META_RESET(meta,row); \ - } \ - SKC_META_ADD(meta,prev, \ - valid_blocks, \ - valid_pks, \ - valid); - - HS_SLAB_ROWS(); - - // - // ATOMICALLY ADD THE CARRIED OUT METAS - // -#if 0 // BUG - if ((valid & 
(1<<(HS_KEYS_PER_LANE-1))) && (meta != 0)) - SKC_META_LOCAL_ADD(meta); -#else - if (meta != 0) - SKC_META_LOCAL_ADD(meta); -#endif - - // - // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE - // - - // convert the slab offset to an extent offset - bool const is_offset = (get_sub_group_local_id() & 3) == 1; - uint const adjust = is_offset ? gmem_base - 1 : 0; - - // - // only process the meta components found in this slab - // - uint const cc_min = c_min * SKC_META_COMPONENTS; - uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1; - uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id(); - - if ((cc >= cc_min) && (cc <= cc_max)) - { - uint const c = shared.c[cc]; - - if (c != 0) - atomic_add(metas+cc,c+adjust); - } - - cc += HS_LANES_PER_WARP; - - for (; cc<=cc_max; cc+=HS_LANES_PER_WARP) - { - uint const c = shared.c[cc]; - - if (c != 0) - atomic_add(metas+cc,c+adjust); - } -} - -// -// -// +/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
+// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
+// KERNEL WILL NEED TO BE UPDATED
+//
+
+#include "tile.h"
+#include "raster_builder_cl_12.h" // need meta_in structure
+#include "device_cl_12.h"
+
+//
+//
+//
+
+#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
+#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
+
+//
+// THE BEST TYPE TO ZERO SMEM
+//
+
+#define SKC_ZERO_TYPE ulong
+#define SKC_ZERO_WORDS 2
+
+//
+// THE ORDER OF COMPONENTS IS:
+//
+// 0: blocks
+// 1: offset
+// 2: pk
+// 3: rk
+//
+
+#if (HS_KEYS_PER_SLAB < 256)
+
+// small slab: all four per-cohort counters fit in one uint (uchar each)
+#define SKC_META_TYPE uint
+#define SKC_META_WORDS 1
+
+#define SKC_COMPONENT_TYPE uchar
+
+#else
+
+// large slab: counters may exceed 255 so widen to ushort components
+#define SKC_META_TYPE uint2
+#define SKC_META_WORDS 2
+
+#define SKC_COMPONENT_TYPE ushort
+
+#endif
+
+//
+// SELECT A COHORT ID TYPE JUST WIDE ENOUGH FOR SKC_TTRK_HI_BITS_COHORT
+//
+
+#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
+#define SKC_COHORT_TYPE uchar
+#else
+#define SKC_COHORT_TYPE ushort
+#endif
+
+//
+// KEY-INSPECTION MACROS -- the r##row identifiers refer to the slab
+// row registers loaded by the kernel below via HS_SLAB_ROW()
+//
+
+// cohort id occupies the top bits of the key's hi word
+#define SKC_COHORT_ID(row) \
+ as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
+
+//
+// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
+//
+
+// a key begins a new block when its low subblock id bits are all zero
+#define SKC_IS_BLOCK(row) \
+ ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+
+// xor of hi words isolates the y/x delta between adjacent keys
+#define SKC_YX(row,prev) \
+ (as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
+
+// pk test: unsigned (delta-1) < MASK_X holds iff delta is in [1,MASK_X],
+// i.e. the keys differ in x only -- presumably same y/cohort; the FIXME
+// above notes this depends on the exact TTRK bit layout
+#define SKC_IS_PK(row,prev) \
+ ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
+
+//
+// COHORT SIZE IS ALWAYS A POWER-OF-TWO
+// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
+//
+// COHORT SIZE >= SUBGROUP SIZE
+//
+
+#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
+
+// how many meta entries one zeroing store covers, and the local-memory
+// extents of the z[] and c[] views of the shared union
+#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
+#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
+#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
+
+#define SKC_META_COMPONENTS 4
+#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
+
+//
+//
+//
+
+//
+// skc_kernel_segment_ttrk
+//
+// Post-sort pass over the slab-ordered TTRK keys -- one sub-group per
+// slab.  For its slab this kernel:
+//
+//   1) reloads the slab's key rows and extracts each key's cohort id
+//   2) flags, per key: a cohort transition ("diff"), a key that starts
+//      a new block, a pk key, and a valid (non-SKC_ULONG_MAX) key
+//   3) transposes the slab in-register and stores it back to vout
+//   4) accumulates per-cohort meta counts { blocks, offset, pk, rk }
+//      in local memory, then atomically adds them to the global
+//      'metas' table, converting the slab-relative offset component to
+//      an extent offset on the way out
+//
+// vout  : sorted TTRK keys in HotSort slab layout -- read, transposed
+//         and rewritten in place
+// metas : global meta table of SKC_META_COMPONENTS counters per cohort
+//
+// NOTE(review): fixed an unbalanced parenthesis in the disabled #if 0
+// pks-capture macro below -- it would not have compiled if enabled.
+//
+__kernel
+__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
+void
+skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
+                        __global uint        * SKC_RESTRICT const metas)
+{
+  // three views of one local scratch area: meta accumulators (m), a
+  // wide type for fast zeroing (z), and narrow per-component access
+  // for the final merge into the global table (c)
+  __local union
+  {
+    SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
+    SKC_ZERO_TYPE          z[SKC_META_ZERO_COUNT];
+    SKC_COMPONENT_TYPE     c[SKC_META_COMPONENT_COUNT];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
+  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);
+  uint const gmem_off  = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
+
+  //
+  // LOAD ALL THE ROWS
+  //
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
+
+  HS_SLAB_ROWS();
+
+  //
+  // LOAD LAST REGISTER FROM COLUMN TO LEFT
+  //
+  uint  diffs = 0;
+  uint2 r0    = 0;
+
+  if (gmem_base > 0) {
+    // if this is the first key in any slab but the first then it
+    // broadcast loads the last key in previous slab
+    r0.hi = as_uint2(vout[gmem_base - 1]).hi;
+  } else {
+    // otherwise broadcast the first key in the first slab
+    r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
+    // and mark it as an implicit diff
+    if (get_sub_group_local_id() == 0)
+      diffs = 1;
+  }
+
+  // now shuffle in the last key from the column to the left
+  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
+
+  // shift away y/x
+  SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
+
+  //
+  // EXTRACT ALL COHORT IDS EARLY...
+  //
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
+
+  HS_SLAB_ROWS();
+
+  //
+  // DEBUG
+  //
+#if 0
+  if (gmem_base == HS_KEYS_PER_SLAB * 7)
+    {
+      if (get_sub_group_local_id() == 0)
+        printf("\n%llX ",as_ulong(r0));
+      else
+        printf("%llX ",as_ulong(r0));
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)             \
+      if (get_sub_group_local_id() == 0)  \
+        printf("\n%llX ",r##row);         \
+      else                                \
+        printf("%llX ",r##row);
+
+      HS_SLAB_ROWS();
+    }
+#endif
+
+  //
+  // CAPTURE ALL CONDITIONS WE CARE ABOUT
+  //
+  // Diffs must be captured before cohorts
+  //
+  uint            valid  = 0;
+  uint            blocks = 0;
+  uint            pks    = 0;
+  SKC_COHORT_TYPE c_max  = 0;
+
+  //
+  // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
+  // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
+  //
+#if 0
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  diffs |= ((c##row != c##prev) << prev);
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  blocks |= (SKC_IS_BLOCK(row) << prev);
+
+  HS_SLAB_ROWS();
+
+  // BUGFIX: was `pks |= SKC_IS_PK(row,prev) << prev);` -- missing the
+  // opening paren, matching the sibling macros above and below
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  pks |= (SKC_IS_PK(row,prev) << prev);
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  valid |= ((r##row != SKC_ULONG_MAX) << prev);
+
+  HS_SLAB_ROWS();
+
+#else
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  if (c##row != c##prev)      \
+    diffs |= 1<<prev;
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  if (SKC_IS_BLOCK(row))      \
+    blocks |= 1<<prev;
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  if (SKC_IS_PK(row,prev))    \
+    pks |= 1<<prev;
+
+  HS_SLAB_ROWS();
+
+  // valid keys also feed the running max cohort id for this lane
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)    \
+  if (r##row != SKC_ULONG_MAX) { \
+    valid |= 1<<prev;            \
+    c_max  = max(c_max,c##row);  \
+  }
+
+  HS_SLAB_ROWS();
+
+#endif
+
+  //
+  // TRANSPOSE THE SLAB AND STORE IT
+  //
+  HS_TRANSPOSE_SLAB();
+
+  // the min cohort is the first key in the slab
+  uint const c_min = sub_group_broadcast(c1,0);
+
+  // the max cohort is the max across all lanes
+  c_max = sub_group_reduce_max(c_max);
+
+#if 0 // REMOVE ME LATER
+  if (get_sub_group_local_id() == 0)
+    printf("%3u : ( %3u , %3u )\n",
+           get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
+#endif
+
+  //
+  // ZERO SMEM
+  //
+  // zero only the meta info for the cohort ids found in this slab
+  //
+#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
+  uint       zz     = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
+  uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
+
+  for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
+    shared.z[zz] = 0;
+#else
+  // ERROR -- it's highly unlikely that the zero type is smaller than
+  // the meta type
+#error("Unsupported right now...")
+#endif
+
+  //
+  // ACCUMULATE AND STORE META INFO
+  //
+  // pk keys on a cohort boundary are suppressed by ~diffs
+  uint const    valid_blocks = valid & blocks;
+  uint const    valid_pks    = valid & pks & ~diffs;
+  SKC_META_TYPE meta         = ( 0 );
+
+#define SKC_META_LOCAL_ADD(meta) \
+  atomic_add(shared.m+HS_REG_LAST(c),meta);
+
+#define SKC_META_LOCAL_STORE(meta,prev) \
+  shared.m[c##prev] = meta;
+
+  // note this is purposefully off by +1
+#define SKC_META_RESET(meta,curr) \
+  meta = ((gmem_off + curr) << 8);
+
+#if 0
+
+  // FIXME -- this can be tweaked to shift directly
+#define SKC_META_ADD(meta,prev,blocks,pks,rks)  \
+  meta += ((((blocks >> prev) & 1)      ) |     \
+           (((pks    >> prev) & 1) << 16) |     \
+           (((rks    >> prev) & 1) << 24));
+
+#else
+
+  // packed component lanes: blocks at bit 0, pk at 16, rk at 24
+#define SKC_META_ADD(meta,prev,blocks,pks,rks)  \
+  if (blocks & (1<<prev))                       \
+    meta += 1;                                  \
+  if (pks & (1<<prev))                          \
+    meta += 1<<16;                              \
+  if (rks & (1<<prev))                          \
+    meta += 1<<24;
+
+#endif
+
+  // walk the rows: flush the running meta at every cohort diff, reset
+  // it with the new cohort's offset, and accumulate this row's flags
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)        \
+  if (diffs & (1<<prev)) {           \
+    SKC_META_LOCAL_STORE(meta,prev); \
+    SKC_META_RESET(meta,row);        \
+  }                                  \
+  SKC_META_ADD(meta,prev,            \
+               valid_blocks,         \
+               valid_pks,            \
+               valid);
+
+  HS_SLAB_ROWS();
+
+  //
+  // ATOMICALLY ADD THE CARRIED OUT METAS
+  //
+#if 0 // BUG
+  if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
+    SKC_META_LOCAL_ADD(meta);
+#else
+  if (meta != 0)
+    SKC_META_LOCAL_ADD(meta);
+#endif
+
+  //
+  // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
+  //
+
+  // convert the slab offset to an extent offset -- component 1 of each
+  // cohort's 4-component group is the offset (hence the `& 3` test)
+  bool const is_offset = (get_sub_group_local_id() & 3) == 1;
+  uint const adjust    = is_offset ? gmem_base - 1 : 0;
+
+  //
+  // only process the meta components found in this slab
+  //
+  uint const cc_min = c_min * SKC_META_COMPONENTS;
+  uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
+  uint       cc     = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
+
+  // first iteration is guarded below cc_min; remaining iterations loop
+  if ((cc >= cc_min) && (cc <= cc_max))
+    {
+      uint const c = shared.c[cc];
+
+      if (c != 0)
+        atomic_add(metas+cc,c+adjust);
+    }
+
+  cc += HS_LANES_PER_WARP;
+
+  for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
+    {
+      uint const c = shared.c[cc];
+
+      if (c != 0)
+        atomic_add(metas+cc,c+adjust);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/path_builder_cl_12.c b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c index e915dffada..e915dffada 100644 --- a/src/compute/skc/path_builder_cl_12.c +++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c diff --git a/src/compute/skc/path_builder_cl_12.h b/src/compute/skc/platforms/cl_12/path_builder_cl_12.h index 20bb13cbdf..20bb13cbdf 100644 --- a/src/compute/skc/path_builder_cl_12.h +++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.h diff --git a/src/compute/skc/raster_builder_cl_12.c b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c index 33992cbdfb..33992cbdfb 100644 --- a/src/compute/skc/raster_builder_cl_12.c +++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c diff --git a/src/compute/skc/raster_builder_cl_12.h b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.h index f6e1751ef1..f6e1751ef1 100644 --- a/src/compute/skc/raster_builder_cl_12.h +++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.h diff --git a/src/compute/skc/runtime_cl.c b/src/compute/skc/platforms/cl_12/runtime_cl.c index a745ed013e..a745ed013e 100644 --- a/src/compute/skc/runtime_cl.c +++ b/src/compute/skc/platforms/cl_12/runtime_cl.c diff --git a/src/compute/skc/runtime_cl.h b/src/compute/skc/platforms/cl_12/runtime_cl.h index 9e58ca0cc7..9e58ca0cc7 100644 --- a/src/compute/skc/runtime_cl.h +++ b/src/compute/skc/platforms/cl_12/runtime_cl.h diff --git a/src/compute/skc/runtime_cl_12.c b/src/compute/skc/platforms/cl_12/runtime_cl_12.c index fca13edbbd..fca13edbbd 100644 --- a/src/compute/skc/runtime_cl_12.c +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.c diff --git a/src/compute/skc/runtime_cl_12.h b/src/compute/skc/platforms/cl_12/runtime_cl_12.h index 7e7ffcb284..7e7ffcb284 100644 --- a/src/compute/skc/runtime_cl_12.h +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.h diff --git a/src/compute/skc/styling_cl_12.c b/src/compute/skc/platforms/cl_12/styling_cl_12.c index 6c84fe6f70..6c84fe6f70 100644 --- 
a/src/compute/skc/styling_cl_12.c +++ b/src/compute/skc/platforms/cl_12/styling_cl_12.c diff --git a/src/compute/skc/styling_cl_12.h b/src/compute/skc/platforms/cl_12/styling_cl_12.h index a319568ee5..a319568ee5 100644 --- a/src/compute/skc/styling_cl_12.h +++ b/src/compute/skc/platforms/cl_12/styling_cl_12.h diff --git a/src/compute/skc/surface_cl_12.h b/src/compute/skc/platforms/cl_12/surface_cl_12.h index 43ea5428a5..43ea5428a5 100644 --- a/src/compute/skc/surface_cl_12.h +++ b/src/compute/skc/platforms/cl_12/surface_cl_12.h diff --git a/src/compute/skc/surface_cl_12_buffer.c b/src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c index cc7cba5225..cc7cba5225 100644 --- a/src/compute/skc/surface_cl_12_buffer.c +++ b/src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c diff --git a/src/compute/skc/types.h b/src/compute/skc/types.h index 6d6d19aba2..655cea0ad4 100644 --- a/src/compute/skc/types.h +++ b/src/compute/skc/types.h @@ -38,12 +38,6 @@ // // -#include <stdbool.h> - -// -// -// - #define SKC_TYPE_HELPER(t) skc_##t #define SKC_TYPE(t) SKC_TYPE_HELPER(t) @@ -114,16 +108,16 @@ typedef cl_float16 skc_float16; typedef cl_half skc_half; -#if defined( __CL_HALF2__) +#if defined(__CL_HALF2__) typedef cl_half2 skc_half2; #endif -#if defined( __CL_HALF4__) +#if defined(__CL_HALF4__) typedef cl_half4 skc_half4; #endif -#if defined( __CL_HALF8__) +#if defined(__CL_HALF8__) typedef cl_half8 skc_half8; #endif -#if defined( __CL_HALF16__) +#if defined(__CL_HALF16__) typedef cl_half16 skc_half16; #endif @@ -206,16 +200,16 @@ typedef float16 skc_float16; typedef half skc_half; -#if defined( __CL_HALF2__) +#if defined(__CL_HALF2__) typedef half2 skc_half2; #endif -#if defined( __CL_HALF4__) +#if defined(__CL_HALF4__) typedef half4 skc_half4; #endif -#if defined( __CL_HALF8__) +#if defined(__CL_HALF8__) typedef half8 skc_half8; #endif -#if defined( __CL_HALF16__) +#if defined(__CL_HALF16__) typedef half16 skc_half16; #endif @@ -243,12 +237,6 @@ typedef half16 skc_half16; 
// // -#endif - -// -// -// - #define SKC_UCHAR_MAX 0xFF #define SKC_SHORT_MAX 0x7FFF @@ -265,3 +253,9 @@ typedef half16 skc_half16; // // +#endif + +// +// +// + |