From 879c98913c360b01f63588685c01ac06e83be54d Mon Sep 17 00:00:00 2001 From: Allan MacKinnon Date: Wed, 20 Jun 2018 08:29:07 -0700 Subject: Overdue reorg of source tree to support multiple platforms & devices. Bug: skia: Change-Id: I1248a529a932ed5ef32952a1bb7eca56ee1c5f25 Reviewed-on: https://skia-review.googlesource.com/136170 Reviewed-by: Mike Klein Commit-Queue: Mike Klein --- src/compute/skc/Makefile | 79 - src/compute/skc/allocator_device_cl.c | 136 - src/compute/skc/allocator_device_cl.h | 54 - src/compute/skc/atomic_cl.h | 72 - src/compute/skc/block_pool_cl.h | 60 - src/compute/skc/block_pool_cl_12.h | 33 - src/compute/skc/block_pool_init.cl | 64 - src/compute/skc/cl_20/extent.c | 787 ----- src/compute/skc/cl_20/extent.h | 390 --- src/compute/skc/cl_20/ring_cl_svm_fine.cpp | 89 - src/compute/skc/cl_20/ring_cl_svm_fine.h | 46 - src/compute/skc/common.h | 2 + src/compute/skc/composition_cl_12.c | 823 ----- src/compute/skc/composition_cl_12.h | 105 - src/compute/skc/config_cl.h | 147 - src/compute/skc/cq_pool_cl.c | 152 - src/compute/skc/cq_pool_cl.h | 46 - src/compute/skc/device_cl_12.h | 95 - src/compute/skc/device_cl_12_avx2.h | 60 - src/compute/skc/device_cl_12_gen9.c | 942 ------ src/compute/skc/device_cl_12_gen9.h | 335 -- src/compute/skc/export_cl_12.h | 63 - src/compute/skc/extent_cl_12.c | 459 --- src/compute/skc/extent_cl_12.h | 476 --- src/compute/skc/extent_cl_12_unified.c | 281 -- src/compute/skc/fills_expand.cl | 309 -- src/compute/skc/handle_pool_cl_12.c | 752 ----- src/compute/skc/handle_pool_cl_12.h | 177 - src/compute/skc/interop.c | 629 ---- src/compute/skc/interop.h | 42 - src/compute/skc/main.c | 20 +- src/compute/skc/make_all.bat | 15 - src/compute/skc/make_inl_cl.bat | 72 - src/compute/skc/path_builder_cl_12.c | 1443 --------- src/compute/skc/path_builder_cl_12.h | 44 - src/compute/skc/paths_copy.cl | 543 ---- src/compute/skc/paths_reclaim.cl | 390 --- src/compute/skc/place.cl | 871 ----- .../skc/platforms/cl_12/allocator_device_cl.c | 136 + .../skc/platforms/cl_12/allocator_device_cl.h | 54 + src/compute/skc/platforms/cl_12/atomic_cl.h | 72 + src/compute/skc/platforms/cl_12/block_pool_cl.h | 60 + src/compute/skc/platforms/cl_12/block_pool_cl_12.h | 33 + .../skc/platforms/cl_12/composition_cl_12.c | 823 +++++ .../skc/platforms/cl_12/composition_cl_12.h | 105 + src/compute/skc/platforms/cl_12/config_cl.h | 147 + src/compute/skc/platforms/cl_12/cq_pool_cl.c | 152 + src/compute/skc/platforms/cl_12/cq_pool_cl.h | 46 + src/compute/skc/platforms/cl_12/device_cl_12.h | 95 + src/compute/skc/platforms/cl_12/export_cl_12.h | 63 + src/compute/skc/platforms/cl_12/extent_cl_12.c | 459 +++ src/compute/skc/platforms/cl_12/extent_cl_12.h | 476 +++ .../skc/platforms/cl_12/extent_cl_12_unified.c | 281 ++ src/compute/skc/platforms/cl_12/gl/interop.c | 629 ++++ src/compute/skc/platforms/cl_12/gl/interop.h | 42 + .../skc/platforms/cl_12/handle_pool_cl_12.c | 752 +++++ .../skc/platforms/cl_12/handle_pool_cl_12.h | 177 + .../skc/platforms/cl_12/kernels/block_pool_init.cl | 64 + .../cl_12/kernels/devices/avx2/device_cl_12_avx2.h | 60 + .../cl_12/kernels/devices/gen9/device_cl_12.c | 938 ++++++ .../cl_12/kernels/devices/gen9/device_cl_12.h | 341 ++ .../cl_12/kernels/devices/gen9/inl/make_all.bat | 15 + .../cl_12/kernels/devices/gen9/inl/make_inl_cl.bat | 85 + .../skc/platforms/cl_12/kernels/fills_expand.cl | 309 ++ .../skc/platforms/cl_12/kernels/paths_copy.cl | 543 ++++ .../skc/platforms/cl_12/kernels/paths_reclaim.cl | 390 +++ src/compute/skc/platforms/cl_12/kernels/place.cl | 871 
+++++ src/compute/skc/platforms/cl_12/kernels/prefix.cl | 1041 ++++++ .../skc/platforms/cl_12/kernels/rasterize.cl | 3366 +++++++++++++++++++ .../skc/platforms/cl_12/kernels/rasters_alloc.cl | 144 + .../skc/platforms/cl_12/kernels/rasters_reclaim.cl | 442 +++ src/compute/skc/platforms/cl_12/kernels/render.cl | 2165 +++++++++++++ .../skc/platforms/cl_12/kernels/segment_ttck.cl | 130 + .../skc/platforms/cl_12/kernels/segment_ttrk.cl | 394 +++ .../skc/platforms/cl_12/path_builder_cl_12.c | 1443 +++++++++ .../skc/platforms/cl_12/path_builder_cl_12.h | 44 + .../skc/platforms/cl_12/raster_builder_cl_12.c | 1349 ++++++++ .../skc/platforms/cl_12/raster_builder_cl_12.h | 165 + src/compute/skc/platforms/cl_12/runtime_cl.c | 362 +++ src/compute/skc/platforms/cl_12/runtime_cl.h | 79 + src/compute/skc/platforms/cl_12/runtime_cl_12.c | 314 ++ src/compute/skc/platforms/cl_12/runtime_cl_12.h | 177 + src/compute/skc/platforms/cl_12/styling_cl_12.c | 339 ++ src/compute/skc/platforms/cl_12/styling_cl_12.h | 73 + src/compute/skc/platforms/cl_12/surface_cl_12.h | 32 + .../skc/platforms/cl_12/surface_cl_12_buffer.c | 453 +++ src/compute/skc/prefix.cl | 1042 ------ src/compute/skc/raster_builder_cl_12.c | 1349 -------- src/compute/skc/raster_builder_cl_12.h | 165 - src/compute/skc/rasterize.cl | 3367 -------------------- src/compute/skc/rasters_alloc.cl | 144 - src/compute/skc/rasters_reclaim.cl | 442 --- src/compute/skc/render.cl | 2165 ------------- src/compute/skc/runtime_cl.c | 362 --- src/compute/skc/runtime_cl.h | 79 - src/compute/skc/runtime_cl_12.c | 314 -- src/compute/skc/runtime_cl_12.h | 177 - src/compute/skc/segment_ttck.cl | 131 - src/compute/skc/segment_ttrk.cl | 396 --- src/compute/skc/styling_cl_12.c | 339 -- src/compute/skc/styling_cl_12.h | 73 - src/compute/skc/surface_cl_12.h | 32 - src/compute/skc/surface_cl_12_buffer.c | 453 --- src/compute/skc/types.h | 34 +- 104 files changed, 20756 insertions(+), 22141 deletions(-) delete mode 100644 src/compute/skc/Makefile delete mode 100644 src/compute/skc/allocator_device_cl.c delete mode 100644 src/compute/skc/allocator_device_cl.h delete mode 100644 src/compute/skc/atomic_cl.h delete mode 100644 src/compute/skc/block_pool_cl.h delete mode 100644 src/compute/skc/block_pool_cl_12.h delete mode 100644 src/compute/skc/block_pool_init.cl delete mode 100644 src/compute/skc/cl_20/extent.c delete mode 100644 src/compute/skc/cl_20/extent.h delete mode 100644 src/compute/skc/cl_20/ring_cl_svm_fine.cpp delete mode 100644 src/compute/skc/cl_20/ring_cl_svm_fine.h delete mode 100644 src/compute/skc/composition_cl_12.c delete mode 100644 src/compute/skc/composition_cl_12.h delete mode 100644 src/compute/skc/config_cl.h delete mode 100644 src/compute/skc/cq_pool_cl.c delete mode 100644 src/compute/skc/cq_pool_cl.h delete mode 100644 src/compute/skc/device_cl_12.h delete mode 100644 src/compute/skc/device_cl_12_avx2.h delete mode 100644 src/compute/skc/device_cl_12_gen9.c delete mode 100644 src/compute/skc/device_cl_12_gen9.h delete mode 100644 src/compute/skc/export_cl_12.h delete mode 100644 src/compute/skc/extent_cl_12.c delete mode 100644 src/compute/skc/extent_cl_12.h delete mode 100644 src/compute/skc/extent_cl_12_unified.c delete mode 100644 src/compute/skc/fills_expand.cl delete mode 100644 src/compute/skc/handle_pool_cl_12.c delete mode 100644 src/compute/skc/handle_pool_cl_12.h delete mode 100644 src/compute/skc/interop.c delete mode 100644 src/compute/skc/interop.h delete mode 100644 src/compute/skc/make_all.bat delete mode 100644 
src/compute/skc/make_inl_cl.bat delete mode 100644 src/compute/skc/path_builder_cl_12.c delete mode 100644 src/compute/skc/path_builder_cl_12.h delete mode 100644 src/compute/skc/paths_copy.cl delete mode 100644 src/compute/skc/paths_reclaim.cl delete mode 100644 src/compute/skc/place.cl create mode 100644 src/compute/skc/platforms/cl_12/allocator_device_cl.c create mode 100644 src/compute/skc/platforms/cl_12/allocator_device_cl.h create mode 100644 src/compute/skc/platforms/cl_12/atomic_cl.h create mode 100644 src/compute/skc/platforms/cl_12/block_pool_cl.h create mode 100644 src/compute/skc/platforms/cl_12/block_pool_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/composition_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/composition_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/config_cl.h create mode 100644 src/compute/skc/platforms/cl_12/cq_pool_cl.c create mode 100644 src/compute/skc/platforms/cl_12/cq_pool_cl.h create mode 100644 src/compute/skc/platforms/cl_12/device_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/export_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/extent_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/extent_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/extent_cl_12_unified.c create mode 100644 src/compute/skc/platforms/cl_12/gl/interop.c create mode 100644 src/compute/skc/platforms/cl_12/gl/interop.h create mode 100644 src/compute/skc/platforms/cl_12/handle_pool_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/handle_pool_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat create mode 100644 src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat create mode 100644 src/compute/skc/platforms/cl_12/kernels/fills_expand.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/paths_copy.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/place.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/prefix.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/rasterize.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/render.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl create mode 100644 src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl create mode 100644 src/compute/skc/platforms/cl_12/path_builder_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/path_builder_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/raster_builder_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/raster_builder_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/runtime_cl.c create mode 100644 src/compute/skc/platforms/cl_12/runtime_cl.h create mode 100644 src/compute/skc/platforms/cl_12/runtime_cl_12.c create mode 100644 src/compute/skc/platforms/cl_12/runtime_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/styling_cl_12.c create mode 100644 
src/compute/skc/platforms/cl_12/styling_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/surface_cl_12.h create mode 100644 src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c delete mode 100644 src/compute/skc/prefix.cl delete mode 100644 src/compute/skc/raster_builder_cl_12.c delete mode 100644 src/compute/skc/raster_builder_cl_12.h delete mode 100644 src/compute/skc/rasterize.cl delete mode 100644 src/compute/skc/rasters_alloc.cl delete mode 100644 src/compute/skc/rasters_reclaim.cl delete mode 100644 src/compute/skc/render.cl delete mode 100644 src/compute/skc/runtime_cl.c delete mode 100644 src/compute/skc/runtime_cl.h delete mode 100644 src/compute/skc/runtime_cl_12.c delete mode 100644 src/compute/skc/runtime_cl_12.h delete mode 100644 src/compute/skc/segment_ttck.cl delete mode 100644 src/compute/skc/segment_ttrk.cl delete mode 100644 src/compute/skc/styling_cl_12.c delete mode 100644 src/compute/skc/styling_cl_12.h delete mode 100644 src/compute/skc/surface_cl_12.h delete mode 100644 src/compute/skc/surface_cl_12_buffer.c (limited to 'src/compute') diff --git a/src/compute/skc/Makefile b/src/compute/skc/Makefile deleted file mode 100644 index e6516e3fd1..0000000000 --- a/src/compute/skc/Makefile +++ /dev/null @@ -1,79 +0,0 @@ -# -# Copyright 2016 Google Inc. -# -# Use of this source code is governed by a BSD-style license that can -# be found in the LICENSE file. -# - -SRC = block_pool_init.cl paths_copy.cl fills_expand.cl rasterize.cl raster_alloc.cl prefix.cl place.cl render.cl - -PRE = $(SRC:%.cl=%.pre.cl) - -IR_GEN9 = $(PRE:%.cl=%.ir) - -$(info PRE : $(PRE)) -$(info IR_GEN9 : $(IR_GEN9)) - -# -# -# - -OPENCL_STD = -cl-std=CL1.2 -OPENCL_PRE = __OPENCL_C_VERSION__=120 - -# OPENCL_STD = -cl-std=CL2.0 -# OPENCL_PRE = __OPENCL_C_VERSION__=200 - -# -# -# - -TARGETS = $(PRE) $(IR_GEN9) - -# -# -# - -IOC = ioc64 - -IOC_IR_OPTS_OPT = $(OPENCL_STD) -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable \ - -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info - -IOC_IR_OPTS_DBG = $(OPENCL_STD) -cl-kernel-arg-info -g - -IOC_IR_OPTS = $(IOC_IR_OPTS_OPT) - -# -# -# - -PRE_DEPS = $(wildcard *.h) - -# -# -# - -all: $(TARGETS) - - -clean: - -rm -f $(TARGETS) $(wildcard *.pre.bin.inl) $(wildcard *.pre.src.inl) $(wildcard *.gen) $(wildcard *.TMP) - -# -# PREPROCESS -# - -$(PRE): %.pre.cl: %.cl $(PRE_DEPS) - cl -I . -I "%INTELOCLSDKROOT%\include" -D $(OPENCL_PRE) -EP $< -P -Fi"$@" - clang-format -i $@ - dos2unix $@ - xxd -i $@ $(basename $@).src.inl - -# -# GEN9 -- supports OpenCL 2.0 and can emit SPIR-V / SPIR-V TEXT but cannot load it via clCreateProgramWithIL() -# - -$(IR_GEN9): %.ir: %.cl - touch $@ - $(IOC) -cmd=build -bo="$(IOC_IR_OPTS)" -device=gpu -input=$< -ir=$@ -asm - xxd -i $@ $(basename $@).bin.inl diff --git a/src/compute/skc/allocator_device_cl.c b/src/compute/skc/allocator_device_cl.c deleted file mode 100644 index aa44f36e87..0000000000 --- a/src/compute/skc/allocator_device_cl.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include "runtime_cl_12.h" -#include "config_cl.h" -#include "common/cl/assert_cl.h" - -// -// PERM -// - -cl_mem -skc_runtime_device_perm_alloc(struct skc_runtime * const runtime, - cl_mem_flags const flags, - size_t const size) -{ - cl_int cl_err; - - cl_mem mem = clCreateBuffer(runtime->cl.context, - flags, - size, - NULL, - &cl_err); cl_ok(cl_err); - return mem; -} - -void -skc_runtime_device_perm_free(struct skc_runtime * const runtime, - cl_mem const mem) -{ - cl(ReleaseMemObject(mem)); -} - -// -// TEMP -// - -cl_mem -skc_runtime_device_temp_alloc(struct skc_runtime * const runtime, - cl_mem_flags const flags, - size_t const size, - skc_subbuf_id_t * const subbuf_id, - size_t * const subbuf_size) -{ - if (size == 0) - { - *subbuf_id = (skc_subbuf_id_t)-1; - - if (subbuf_size != NULL) - *subbuf_size = 0; - - return NULL; - } - - cl_buffer_region br; - - br.origin = skc_suballocator_subbuf_alloc(&runtime->allocator.device.temp.suballocator, - runtime->scheduler, - size,subbuf_id,&br.size); - - if (subbuf_size != NULL) - *subbuf_size = br.size; - - cl_int cl_err; - - cl_mem mem = clCreateSubBuffer(runtime->allocator.device.temp.extent, - flags, - CL_BUFFER_CREATE_TYPE_REGION, - &br, - &cl_err); cl_ok(cl_err); - - return mem; -} - - -void -skc_runtime_device_temp_free(struct skc_runtime * const runtime, - cl_mem const mem, - skc_subbuf_id_t const subbuf_id) -{ - if (mem == NULL) - return; - - skc_suballocator_subbuf_free(&runtime->allocator.device.temp.suballocator,subbuf_id); - - cl(ReleaseMemObject(mem)); -} - -// -// -// - -void -skc_allocator_device_create(struct skc_runtime * const runtime) -{ - skc_suballocator_create(runtime, - &runtime->allocator.device.temp.suballocator, - "DEVICE", - runtime->config->suballocator.device.subbufs, - runtime->cl.base_align, - runtime->config->suballocator.device.size); - -#ifndef NDEBUG -#pragma message("Get rid of CL_MEM_ALLOC_HOST_PTR as soon as the sorter is installed") - cl_mem_flags const flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; -#else - cl_mem_flags const flags = CL_MEM_READ_WRITE; -#endif - - runtime->allocator.device.temp.extent = - skc_runtime_device_perm_alloc(runtime, - flags, - runtime->config->suballocator.device.size); -} - -void -skc_allocator_device_dispose(struct skc_runtime * const runtime) -{ - skc_suballocator_dispose(runtime,&runtime->allocator.device.temp.suballocator); - - skc_runtime_device_perm_free(runtime,runtime->allocator.device.temp.extent); -} - -// -// -// - diff --git a/src/compute/skc/allocator_device_cl.h b/src/compute/skc/allocator_device_cl.h deleted file mode 100644 index 67d4e41398..0000000000 --- a/src/compute/skc/allocator_device_cl.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include - -// -// -// - -#include "suballocator.h" - -// -// -// - -struct skc_allocator_device -{ -#if 0 - struct { - - } perm; -#endif - - struct { - struct skc_suballocator suballocator; - cl_mem extent; - } temp; -}; - -// -// -// - -void -skc_allocator_device_create(struct skc_runtime * const runtime); - -void -skc_allocator_device_dispose(struct skc_runtime * const runtime); - -// -// -// - diff --git a/src/compute/skc/atomic_cl.h b/src/compute/skc/atomic_cl.h deleted file mode 100644 index c196c36390..0000000000 --- a/src/compute/skc/atomic_cl.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 Google Inc. 
- * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_ONCE_ATOMIC_CL -#define SKC_ONCE_ATOMIC_CL - -// -// git cl upload is bleating about needing an #include before and #if -// so we're unneccesarily reloading the types and OpenCL header -// - -#include "types.h" - -#if (__OPENCL_C_VERSION__ <= 120 /*CL_VERSION_1_2*/) - -#define SKC_ATOMIC_UINT uint -#define SKC_ATOMIC_INT int - -#define SKC_ATOMIC_ADD_LOCAL_RELAXED_DEVICE(p,v) atomic_add(p,v) -#define SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(p,v) atomic_add(p,v) - -#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(p,v) atomic_add(p,v) -#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(p,v) atomic_add(p,v) - -#else // __OPENCL_C_VERSION__ > __CL_VERSION_1_2 - -// -// REMOVE THESE DEFINES ASAP -- ONLY HERE BECAUSE THE INTEL CODE -// BUILDER UTILITY DOESN'T SUPPORT CREATING AN ATOMIC TYPE BUFFER -// - -#ifdef SKC_SUPPORT_BROKEN_INTEL_CODE_BUILDER - -#define SKC_ATOMIC_UINT uint -#define SKC_ATOMIC_CAST_LOCAL(p) (__local atomic_uint volatile * restrict const)(p) -#define SKC_ATOMIC_CAST_GLOBAL(p) (__global atomic_uint volatile * restrict const)(p) - -#else - -#define SKC_ATOMIC_UINT atomic_uint -#define SKC_ATOMIC_CAST_LOCAL(p) (p) -#define SKC_ATOMIC_CAST_GLOBAL(p) (p) - -#endif - - -#define SKC_ATOMIC_ADD_LOCAL_RELAXED_DEVICE(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_LOCAL(p), \ - v,memory_order_relaxed,memory_scope_device) -#define SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_LOCAL(p), \ - v,memory_order_relaxed,memory_scope_sub_group) - -#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_GLOBAL(p), \ - v,memory_order_relaxed,memory_scope_device) -#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_GLOBAL(p), \ - v,memory_order_relaxed,memory_scope_sub_group) - -#endif - -// -// -// - -#endif // SKC_ONCE_ATOMIC_CL - -// -// -// diff --git a/src/compute/skc/block_pool_cl.h b/src/compute/skc/block_pool_cl.h deleted file mode 100644 index c88370919e..0000000000 --- a/src/compute/skc/block_pool_cl.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_ONCE_BLOCK_POOL -#define SKC_ONCE_BLOCK_POOL - -// -// -// - -#include "types.h" - -// -// -// - -union skc_block_pool_size -{ - skc_uint3 u32v3; - - struct { - skc_uint pool_size; // number of blocks - skc_uint ring_pow2; // rounded-up pow2 of pool_size - skc_uint ring_mask; // ring_pow2 - 1 - }; -}; - -// -// -// - -union skc_block_pool_atomic -{ - skc_uint2 u32v2; - - skc_uint u32a2[2]; - - struct { - skc_uint reads; - skc_uint writes; - }; -}; - -#define SKC_BP_ATOMIC_OFFSET_READS 0 -#define SKC_BP_ATOMIC_OFFSET_WRITES 1 - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/block_pool_cl_12.h b/src/compute/skc/block_pool_cl_12.h deleted file mode 100644 index 6fa8a39ca0..0000000000 --- a/src/compute/skc/block_pool_cl_12.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include "block_pool_cl.h" -#include "extent_cl_12.h" - -// -// device side block pool -// - -struct skc_block_pool -{ - union skc_block_pool_size const * size; - - struct skc_extent_pdrw blocks; - struct skc_extent_pdrw ids; - struct skc_extent_phr_pdrw atomics; -}; - -// -// -// diff --git a/src/compute/skc/block_pool_init.cl b/src/compute/skc/block_pool_init.cl deleted file mode 100644 index 023dff44cf..0000000000 --- a/src/compute/skc/block_pool_init.cl +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" - -// -// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ -// - -__kernel -SKC_BP_INIT_IDS_KERNEL_ATTRIBS -void -skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size) -{ - uint const gid = get_global_id(0); - - // - // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to - // accomplish this with fewer threads and using either IPC and/or - // vector stores -- it should be on certain architectures! - // - - // - // initialize pool with sequence - // - if (gid < bp_size) - ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK; -} - -// -// -// - -__kernel -SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS -void -skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size) -{ - // the version test is to squelch a bug with the Intel OpenCL CPU - // compiler declaring it supports the cl_intel_subgroups extension -#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups) - uint const tid = get_sub_group_local_id(); -#else - uint const tid = get_local_id(0); -#endif - - // - // launch two threads and store [ 0, bp_size ] - // - bp_atomics[tid] = tid * bp_size; -} - -// -// -// diff --git a/src/compute/skc/cl_20/extent.c b/src/compute/skc/cl_20/extent.c deleted file mode 100644 index 4c073e8b69..0000000000 --- a/src/compute/skc/cl_20/extent.c +++ /dev/null @@ -1,787 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#include - -// #include "extent.h" - -// -// EXTENT TYPES -// -// Classification of operations on allocated GPU memory -// -// h = host -// d = device -// -// c = append using non-atomic incremented count -// x = append using atomically incremented index -// p = allocated from pool of indices -// g = gathered by pull kernel -// s = size is available -// -// w1 = write once -// wN = write many -// -// r1 = read once -// rN = read many -// -// rw = read/write many -// -// host<>device memory model -// +--------------------+--------------------+ -// extent type | split | shared | examples -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_atomic | device+mapped | device+mapped | atomically op'd device extent + read-only host snapshot -// | | | -// extent_dxrw | device | device | ttsk_array, ttpk_array, ttck_array, *_offsets -// extent_hcw1_dr1 | mapped | mapped | command_queue, buffer -// extent_hcrw | host | host | queue -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_hcw1_drN | memcpy'd | mapped | stack_transforms, stack_stroke_props -// extent_hgw1_drN | scatter/gather | mapped | layer_props -// | | | -// block_pool_dprw | device | device | ttsb_pool, ttpb_pool -// block_pool_hp_drw | device | device | raster_pool -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// block_pool_hp_drw | block_pool_hp_drw | block_pool_hp_drw | path_block_pool -// staging buffer | extent_hw_dr | -- | -// | | | -// - -// -// HIGH-LEVEL EXTENTS ARE BUILT FROM SIMPLER STRUCTURES -// - -// -// COUNTERS FOR POOLS -- TYPICALLY ATOMIC WHEN ON DEVICE -// - -union skc_ring -{ - skc_uint2 u32v2; - - skc_uint u32a2[2]; - - struct { - skc_uint reads; // number of reads - skc_uint writes; // number of writes - }; -}; - -// -// POOL OF INDICES TO BLOCKS -// - -struct skc_pool_h -{ - skc_uint * indices; -}; - -struct skc_pool_d -{ - cl_mem * indices; // FIXME -- READ POOL INDICES THROUGH CONSTANT CACHE? 
-}; - -// -// LOW-LEVEL EXTENTS -- SIZES ARE STORED ELSEWHERE -// - -struct skc_extent_hrw -{ - void * hrw; // host pointer to host extent -- read/write -}; - -struct skc_extent_drw -{ - cl_mem drw; // device pointer to device extent -- read/write -}; - -struct skc_extent_hw_dr -{ - void * hw; // host pointer to shared extent -- write-only + write-combined - cl_mem dr; // device pointer to shared extent -- read-only -}; - -// -// -// - -#if 0 -static -void * -skc_runtime_svm_alloc(struct skc_runtime_cl * const runtime_cl, size_t const size) -{ - return clSVMAlloc(runtime_cl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, - size, - 0); -} - -static -void * -skc_runtime_svm_atomic_alloc(struct skc_runtime_cl * const runtime_cl, size_t const size) // WE DON'T NEED THIS HERE -{ - return clSVMAlloc(runtime_cl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, - size, - 0); -} - -static -void -skc_runtime_svm_free(struct skc_runtime_cl * const runtime_cl, void * const buffer) -{ - clSVMFree(runtime_cl->context,buffer); -} -#endif - -// -// -// - -void -skc_command_queue_fill_device(struct skc_command_queue * const cq, - cl_mem buffer, - void const * const pattern, - size_t const pattern_size, - size_t const size); - -void * -skc_command_queue_map_wi(struct skc_command_queue * const cq, - cl_mem buffer); - -void -skc_command_queue_unmap(struct skc_command_queue * const cq, - cl_mem buffer, - void * const mapped); - -void -skc_command_queue_read(struct skc_command_queue * const cq, - cl_mem buffer, - void * const ptr); - -// -// -// - -struct skc_extent_hrw * -skc_extent_hrw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hrw = skc_allocator_alloc_host(allocator,size); - - return extent; -} - - - -void -skc_extent_hrw_free(struct skc_allocator * const allocator, - struct skc_extent_hrw * const extent) -{ - skc_allocator_free_host(allocator,extent->hrw); - skc_allocator_free_host(allocator,extent); -} - -// -// -// - -struct skc_extent_drw * -skc_extent_drw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_drw * extent; - - extent = skc_allocator_alloc_host (allocator,sizeof(*extent)); - extent->drw = skc_allocator_alloc_device(allocator,size); - - return extent; -} - -void -skc_extent_drw_free(struct skc_allocator * const allocator, - struct skc_extent_drw * const extent) -{ - skc_allocator_free_device(allocator,extent->drw); - skc_allocator_free_host (allocator,extent); -} - -void -skc_extent_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size) -{ - skc_command_queue_fill_device(cq,extent->drw,pattern,pattern_size,size); -} - -// -// WRITE-COMBINED / WRITE-INVALIDATE -// - -struct skc_extent_hw_dr * -skc_extent_hw_dr_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hw_dr * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw = NULL; - extent->dr = skc_allocator_alloc_device_wc(allocator,size); // write-combined mem - - return extent; -} - -void -skc_extent_hw_dr_free(struct skc_allocator * const allocator, - struct skc_extent_hw_dr * const extent) -{ - skc_allocator_free_device(allocator,extent->dr); - skc_allocator_free_host (allocator,extent); -} - -void -skc_extent_hw_dr_map(struct 
skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent) -{ - extent->hw = skc_command_queue_map_wi(cq,extent->dr); -} - -void -skc_extent_hw_dr_unmap(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent) -{ - skc_command_queue_unmap(cq,extent->dr,extent->hw); -} - -void -skc_extent_hw_dr_memcpy(struct skc_extent_hw_dr * const extent, - void const * SKC_RESTRICT const src, - size_t const offset, - size_t const size) -{ - void * SKC_RESTRICT const dst = (char *)extent->hw + offset; - - memcpy(dst,src,size); -} -// -// SNAPSHOT -// - -struct skc_extent_hr_drw -{ - void * hr; // host pointer to shared extent -- readable snapshot - cl_mem drw; // device pointer to shared extent -- read/write -}; - -struct skc_extent_hr_drw * -skc_extent_hr_drw_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_hr_drw * extent; - - extent = skc_allocator_alloc_host (allocator,sizeof(*extent)); - extent->hr = skc_allocator_alloc_host (allocator,size); - extent->drw = skc_allocator_alloc_device(allocator,size); - - return extent; -} - -void -skc_extent_hr_drw_free(struct skc_allocator * const allocator, - struct skc_extent_hr_drw * const extent) -{ - skc_allocator_free_host (allocator,extent->hr); - skc_allocator_free_device(allocator,extent->drw); - skc_allocator_free_host (allocator,extent); -} - -void -skc_extent_hr_drw_snap(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - size_t const size) -{ - skc_command_queue_read(cq,extent->drw,extent->hr); -} - -void -skc_extent_hr_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size) -{ - skc_command_queue_fill_device(cq,extent->drw,pattern,pattern_size,size); -} - -// -// -// - -struct skc_extent_atomic -{ - struct skc_extent_hr_drw * hr_drw; - size_t size; // typically a very small extent -}; - -// -// -// - -struct skc_extent_atomic * -skc_extent_atomic_alloc(struct skc_allocator * const allocator, - size_t const size) -{ - struct skc_extent_atomic * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hr_drw = skc_extent_hr_drw_alloc(allocator,size); - extent->size = size; - - return extent; -} - -void -skc_extent_atomic_free(struct skc_allocator * const allocator, - struct skc_extent_atomic * const extent) -{ - skc_extent_hr_drw_free (allocator,extent->hr_drw); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_atomic_snap(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent) -{ - skc_extent_hr_drw_snap(cq,extent->hr_drw,extent->size); -} - -void -skc_extent_atomic_zero(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent) -{ - skc_uint const zero = 0; - - skc_extent_hr_drw_fill(cq,extent->hr_drw,&zero,sizeof(zero),extent->size); -} - -// -// -// - -struct skc_extent_dxrw -{ - struct skc_extent_drw * drw; - - size_t elem_size; - skc_uint elem_count; - -#if 0 // SKC_EXTENT_ATOMIC_IS_IGNORED - struct skc_extent_atomic * atomic; - size_t atomic_offset; -#endif -}; - -// -// -// - -struct skc_extent_dxrw * -skc_extent_dxrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count, - struct skc_extent_atomic * const atomic, - size_t const atomic_offset) -{ - struct skc_extent_dxrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->drw = 
skc_extent_drw_alloc(allocator,elem_size * elem_count); - - extent->elem_size = elem_size; - extent->elem_count = elem_count; - - // - // note that passing in the atomic and its member has no real use at - // this point since the current programming style requires passing - // in the atomic extent -- which may have multiple members -- to the - // compute kernel - // -#if 0 // SKC_EXTENT_ATOMIC_IS_IGNORED - extent->atomic = atomic; - extent->atomic_offset = atomic_offset; -#endif - - return extent; -} - -void -skc_extent_dxrw_free(struct skc_allocator * const allocator, - struct skc_extent_dxrw * const extent) -{ - skc_extent_drw_free (allocator,extent->drw); - skc_allocator_free_host(allocator,extent); -} - -// -// -// - -struct skc_extent_hcrw -{ - struct skc_extent_hrw * hrw; - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcrw * -skc_extent_hcrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcrw * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hrw = skc_extent_hrw_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcrw_free(struct skc_allocator * const allocator, - struct skc_extent_hcrw * const extent) -{ - skc_extent_hrw_free (allocator,extent->hrw); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcrw_reset(struct skc_extent_hcrw * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcrw_is_full(struct skc_extent_hcrw const * const extent) -{ - return (extent->counter == extent->elem_count); -} - -// -// -// - -struct skc_extent_hcw1_dr1 -{ - struct skc_extent_hw_dr * hw_dr; // mapped memory - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcw1_dr1 * -skc_extent_hcw1_dr1_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcw1_dr1 * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw_dr = skc_extent_hw_dr_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcw1_dr1_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_free (allocator,extent->hw_dr); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcw1_dr1_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_map(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_dr1_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent) -{ - skc_extent_hw_dr_unmap(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_dr1_reset(struct skc_extent_hcw1_dr1 * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcw1_dr1_is_full(struct skc_extent_hcw1_dr1 const * const extent) -{ - return (extent->counter == extent->elem_count); -} - -skc_uint -skc_extent_hcw1_dr1_rem(struct skc_extent_hcw1_dr1 * const extent) -{ - return extent->elem_count - extent->counter; -} - -void -skc_extent_hcw1_dr1_append(struct skc_extent_hcw1_dr1 * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped) -{ - skc_extent_hw_dr_memcpy(extent->hw_dr, - elem_ptr, - 
extent->elem_size * extent->counter, - extent->elem_size * elem_count_clamped); -} - -// -// -// - -struct skc_extent_hcw1_drN_unified -{ - struct skc_extent_hw_dr * hw_dr; // mapped memory - size_t elem_size; - skc_uint elem_count; - skc_uint counter; -}; - -// -// -// - -struct skc_extent_hcw1_drN_unified * -skc_extent_hcw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - struct skc_extent_hcw1_drN_unified * extent; - - extent = skc_allocator_alloc_host(allocator,sizeof(*extent)); - extent->hw_dr = skc_extent_hw_dr_alloc(allocator,elem_size * elem_count); - extent->elem_size = elem_size; - extent->elem_count = elem_count; - extent->counter = 0; - - return extent; -} - -void -skc_extent_hcw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_drN_unified * const extent) -{ - skc_extent_hw_dr_free (allocator,extent->hw_dr); - skc_allocator_free_host(allocator,extent); -} - -void -skc_extent_hcw1_drN_unified_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent) -{ - skc_extent_hw_dr_map(cq,extent->hw_dr); -} - - -void -skc_extent_hcw1_drN_unified_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent) -{ - skc_extent_hw_dr_unmap(cq,extent->hw_dr); -} - -void -skc_extent_hcw1_drN_unified_reset(struct skc_extent_hcw1_drN_unified * const extent) -{ - extent->counter = 0; -} - -skc_bool -skc_extent_hcw1_drN_unified_is_full(struct skc_extent_hcw1_drN_unified const * const extent) -{ - return (extent->counter == extent->elem_count); -} - - -skc_uint -skc_extent_hcw1_drN_unified_rem(struct skc_extent_hcw1_drN_unified * const extent) -{ - return extent->elem_count - extent->counter; -} - - -void -skc_extent_hcw1_drN_unified_append(struct skc_extent_hcw1_drN_unified * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped) -{ - skc_extent_hw_dr_memcpy(extent->hw_dr, - elem_ptr, - extent->elem_size * extent->counter, - extent->elem_size * elem_count_clamped); -} - -// -// -// - -struct skc_id_pool_hp * -skc_id_pool_hp_alloc(struct skc_allocator * const allocator, - skc_uint const count) -{ - return NULL; -} - -void -skc_id_pool_hp_free(struct skc_allocator * const allocator, - struct skc_id_pool_hp * const extent) -{ - ; -} - -void -skc_id_pool_hp_acquire(struct skc_id_pool_hp * const extent, - skc_uint * const id) -{ - ; -} - -void -skc_id_pool_hp_release_1(struct skc_id_pool_hp * const extent, - skc_uint const id) -{ - ; -} - -void -skc_id_pool_hp_release_n(struct skc_id_pool_hp * const extent, - skc_uint const * const id, - skc_uint const count) -{ - ; -} - -// -// -// - -struct skc_block_pool_dprw * -skc_block_pool_dprw_alloc(struct skc_allocator * const allocator, - union skc_ring * const ring_d, - skc_uint const block_size, - skc_uint const block_count) -{ - return NULL; -} - -void -skc_block_pool_dprw_free(struct skc_allocator * const allocator, - struct skc_block_pool_dprw * const extent) -{ - ; -} - -// -// -// - -struct skc_extent_hgw1_drN * -skc_extent_hgw1_drN_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - return NULL; -} - -void -skc_extent_hgw1_drN_free(struct skc_allocator * const allocator, - struct skc_extent_hgw1_drN * const extent) -{ - ; -} - -void -skc_extent_hgw1_drN_reset(struct skc_extent_hgw1_drN * const extent) -{ - ; -} - -void -skc_extent_hgw1_drN_snap(struct skc_command_queue * const cq, - struct 
skc_extent_hgw1_drN const * const extent) -{ - ; -} - -// -// -// - -#if 0 - -// -// -// - -struct skc_block_pool_hp_drw * -skc_block_pool_hp_drw_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count) -{ - return NULL; -} - -void -skc_block_pool_hp_drw_free(struct skc_allocator * const allocator, - struct skc_block_pool_hp_drw * const extent) -{ - ; -} - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/cl_20/extent.h b/src/compute/skc/cl_20/extent.h deleted file mode 100644 index 2993968a50..0000000000 --- a/src/compute/skc/cl_20/extent.h +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include "skc.h" -#include "allocator.h" - -// -// EXTENT TYPES -// -// Classification of operations on allocated GPU memory -// -// h = host -// d = device -// -// c = append using non-atomic incremented count -// x = append using atomically incremented index -// p = allocated from pool of indices -// g = gathered by pull kernel -// -// w1 = write once -// wN = write many -// -// r1 = read once -// rN = read many -// -// rw = read/write many -// -// host<>device memory model -// +--------------------+--------------------+ -// extent type | split | shared | examples -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_atomic | device+mapped | device+mapped | atomically op'd device extent + read-only host snapshot -// | | | -// extent_dxrw | device | device | ttsk_array, ttpk_array, ttck_array, *_offsets -// extent_hcw1_dr1 | mapped | mapped | command_queue, buffer -// extent_hcrw | host | host | queue -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// extent_hcw1_drN | memcpy'd | mapped | stack_transforms, stack_stroke_props -// extent_hgw1_drN | scatter/gather | mapped | layer_props -// | | | -// block_pool_dprw | device | device | ttsb_pool, ttpb_pool -// block_pool_hp_drw | device | device | raster_pool -// | | | -// ---------------------+--------------------+--------------------+-------------------- -// | | | -// block_pool_hp_drw | block_pool_hp_drw | block_pool_hp_drw | path_block_pool -// staging buffer | extent_hw_dr | -- | -// | | | -// - -struct skc_extent_hrw; -struct skc_extent_drw; - -struct skc_extent_hrw_drN; -struct skc_extent_hw1_drN; -struct skc_extent_hrN_drw; - -struct skc_extent_atomic; - -struct skc_extent_hcrw; -struct skc_extent_dxrw; - -struct skc_block_pool_dprw; - -struct skc_id_pool_hp; - -struct skc_extent_hcw1_dr1; -struct skc_extent_hcw1_drN; -struct skc_extent_hgw1_drN; - -// -// -// - -void * -skc_extent_hrw_drN_get_hrw(struct skc_extent_hrw_drN * extent); - -void * -skc_extent_hw1_drN_get_hw1(struct skc_extent_hw1_drN * extent); - -// -// -// - -struct skc_extent_hrw * -skc_extent_hrw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hrw_free(struct skc_allocator * const allocator, - struct skc_extent_hrw * const extent); - -void * -skc_extent_hrw_get_hrw(struct skc_extent_hrw * extent); - -// -// -// - -struct skc_extent_drw * -skc_extent_drw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_drw_free(struct skc_allocator * const allocator, - struct skc_extent_drw * const extent); - -void -skc_extent_drw_fill(struct skc_command_queue * const cq, - 
struct skc_extent_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size); - -// -// -// - -struct skc_extent_hw_dr * -skc_extent_hw_dr_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hw_dr_free(struct skc_allocator * const allocator, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_map(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_unmap(struct skc_command_queue * const cq, - struct skc_extent_hw_dr * const extent); - -void -skc_extent_hw_dr_memcpy(struct skc_extent_hw_dr * const extent, - void const * SKC_RESTRICT const src, - size_t const offset, - size_t const size); -// -// -// - -struct skc_extent_hr_drw * -skc_extent_hr_drw_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_hr_drw_free(struct skc_allocator * const allocator, - struct skc_extent_hr_drw * const extent); - -void -skc_extent_hr_drw_snap(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - size_t const size); - -void -skc_extent_hr_drw_fill(struct skc_command_queue * const cq, - struct skc_extent_hr_drw * const extent, - void const * const pattern, - size_t const pattern_size, - size_t const size); - -// -// -// - -struct skc_extent_atomic * -skc_extent_atomic_alloc(struct skc_allocator * const allocator, - size_t const size); - -void -skc_extent_atomic_free(struct skc_allocator * const allocator, - struct skc_extent_atomic * const extent); - -void -skc_extent_atomic_snap(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent); - -void -skc_extent_atomic_zero(struct skc_command_queue * const cq, - struct skc_extent_atomic const * const extent); - -// -// -// - - -struct skc_extent_dxrw * -skc_extent_dxrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count, - struct skc_extent_atomic * const atomic, - size_t const atomic_offset); - -void -skc_extent_dxrw_free(struct skc_allocator * const allocator, - struct skc_extent_dxrw * const extent); - -// -// -// - -struct skc_extent_hcrw * -skc_extent_hcrw_alloc(struct skc_allocator * const allocator, - size_t const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcrw_free(struct skc_allocator * const allocator, - struct skc_extent_hcrw * const extent); - -void -skc_extent_hcrw_reset(struct skc_extent_hcrw * const extent); - -skc_bool -skc_extent_hcrw_is_full(struct skc_extent_hcrw const * const extent); - -// -// -// - -struct skc_extent_hcw1_dr1 * -skc_extent_hcw1_dr1_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcw1_dr1_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_reset(struct skc_extent_hcw1_dr1 * const extent); - -skc_bool -skc_extent_hcw1_dr1_is_full(struct skc_extent_hcw1_dr1 const * const extent); - -skc_uint -skc_extent_hcw1_dr1_rem(struct skc_extent_hcw1_dr1 * const extent); - -void -skc_extent_hcw1_dr1_append(struct skc_extent_hcw1_dr1 * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped); - -// -// Note: on a shared memory device 
this reuses the hcw1_dr1 -// implementation and unmaps the extent instead of copying -// - -struct skc_extent_hcw1_drN_unified * -skc_extent_hcw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hcw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_map(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_unmap(struct skc_command_queue * const cq, - struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_reset(struct skc_extent_hcw1_drN_unified * const extent); - -skc_bool -skc_extent_hcw1_drN_unified_is_full(struct skc_extent_hcw1_drN_unified const * const extent); - -skc_uint -skc_extent_hcw1_drN_unified_rem(struct skc_extent_hcw1_drN_unified * const extent); - -void -skc_extent_hcw1_drN_unified_append(struct skc_extent_hcw1_drN_unified * const extent, - void const * SKC_RESTRICT const elem_ptr, - skc_uint const elem_count_clamped); -// -// -// - -struct skc_id_pool_hp * -skc_id_pool_hp_alloc(struct skc_allocator * const allocator, - skc_uint const count); - -void -skc_id_pool_hp_free(struct skc_allocator * const allocator, - struct skc_id_pool_hp * const extent); - -void -skc_id_pool_hp_acquire(struct skc_id_pool_hp * const extent, - skc_uint * const id); - -void -skc_id_pool_hp_release_1(struct skc_id_pool_hp * const extent, - skc_uint const id); - -void -skc_id_pool_hp_release_n(struct skc_id_pool_hp * const extent, - skc_uint const * const id, - skc_uint const count); - -// -// -// - -struct skc_block_pool_dprw * -skc_block_pool_dprw_alloc(struct skc_allocator * const allocator, - union skc_ring * const ring_d, - skc_uint const block_size, - skc_uint const block_count); - -void -skc_block_pool_dprw_free(struct skc_allocator * const allocator, - struct skc_block_pool_dprw * const extent); - -// -// -// - -struct skc_extent_hgw1_drN_unified * -skc_extent_hgw1_drN_unified_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_extent_hgw1_drN_unified_free(struct skc_allocator * const allocator, - struct skc_extent_hgw1_drN_unified * const extent); - -void -skc_extent_hgw1_drN_unified_reset(struct skc_extent_hgw1_drN_unified * const extent); - -void -skc_extent_hgw1_drN_unified_snap(struct skc_command_queue * const cq, - struct skc_extent_hgw1_drN_unified const * const extent); - -// -// -// - -#if 0 - -// -// -// - -struct skc_block_pool_hp_drw * -skc_block_pool_hp_drw_alloc(struct skc_allocator * const allocator, - skc_uint const elem_size, - skc_uint const elem_count); - -void -skc_block_pool_hp_drw_free(struct skc_allocator * const allocator, - struct skc_block_pool_hp_drw * const extent); - -// -// -// - -#endif - -// -// -// - diff --git a/src/compute/skc/cl_20/ring_cl_svm_fine.cpp b/src/compute/skc/cl_20/ring_cl_svm_fine.cpp deleted file mode 100644 index 9552c81f2d..0000000000 --- a/src/compute/skc/cl_20/ring_cl_svm_fine.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// Fine-grained shared virtual memory ring -// -// There is limited support for C11 atomics in C compilers so -// implement this module in C++11 -// - -extern "C" { - -#include "runtime.h" -#include "ring_cl_svm_fine.h" - -} - -// -// -// - -#include - -// -// -// - -union skc_ring -{ - std::atomic rw[2]; - - struct { - std::atomic reads; // number of reads - std::atomic writes; // number of writes - }; -}; - -// -// -// - -union skc_ring * -skc_ring_cl_svm_fine_alloc(struct skc_runtime_impl * const runtime_impl) -{ - return (union skc_ring *) - clSVMAlloc(runtime_impl->context, - CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, - sizeof(union skc_ring), - 0); -} - -void -skc_ring_cl_svm_fine_init(union skc_ring * const ring, skc_uint writes) -{ - ring->reads = ATOMIC_VAR_INIT(0); - ring->writes = ATOMIC_VAR_INIT(writes); -} - -void -skc_ring_cl_svm_fine_free(struct skc_runtime_impl * const runtime_impl, union skc_ring * const ring) -{ - clSVMFree(runtime_impl->context,ring); -} - -// -// -// - -skc_uint -skc_ring_cl_svm_fine_read(union skc_ring * const ring, skc_uint const n) -{ - return atomic_fetch_add_explicit(&ring->reads,n,std::memory_order_relaxed); -} - -skc_uint -skc_ring_cl_svm_fine_write(union skc_ring * const ring, skc_uint const n) -{ - return atomic_fetch_add_explicit(&ring->writes,n,std::memory_order_relaxed); -} - -// -// -// - diff --git a/src/compute/skc/cl_20/ring_cl_svm_fine.h b/src/compute/skc/cl_20/ring_cl_svm_fine.h deleted file mode 100644 index 65ff9f71f3..0000000000 --- a/src/compute/skc/cl_20/ring_cl_svm_fine.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// Fine-grained shared virtual memory ring -// - -#include "runtime.h" -#include "types.h" - -// -// -// - -union skc_ring * -skc_ring_cl_svm_fine_alloc(struct skc_runtime_impl * const runtime_impl); - -void -skc_ring_cl_svm_fine_free(struct skc_runtime_impl * const runtime_impl, union skc_ring * const ring); - -// -// -// - -void -skc_ring_cl_svm_fine_init(union skc_ring * const ring, skc_uint writes); - -// -// -// - -skc_uint -skc_ring_cl_svm_fine_read(union skc_ring * const ring, skc_uint const n); - -skc_uint -skc_ring_cl_svm_fine_write(union skc_ring * const ring, skc_uint const n); - -// -// -// - diff --git a/src/compute/skc/common.h b/src/compute/skc/common.h index 618ba2242e..5ac42ab2dc 100644 --- a/src/compute/skc/common.h +++ b/src/compute/skc/common.h @@ -9,6 +9,8 @@ #ifndef SKC_COMMON_ONCE #define SKC_COMMON_ONCE +#include "types.h" + // // structures common to both host and device -- placeholder until // everything shakes out diff --git a/src/compute/skc/composition_cl_12.c b/src/compute/skc/composition_cl_12.c deleted file mode 100644 index 7853564636..0000000000 --- a/src/compute/skc/composition_cl_12.c +++ /dev/null @@ -1,823 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include -#include - -#include "hs/cl/hs_cl_launcher.h" - -#include "common/cl/assert_cl.h" - -#include "composition_cl_12.h" -#include "config_cl.h" - -#include "context.h" -#include "raster.h" -#include "handle.h" - -#include "runtime_cl_12.h" - -#include "common.h" -#include "tile.h" - -// -// TTCK (32-BIT COMPARE) v1: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 18 | 7 | 7 | -// -// -// TTCK (32-BIT COMPARE) v2: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 15 | 9 | 8 | -// -// -// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 27 | 1 | 1 | 18 | 9 | 8 | -// - -union skc_ttck -{ - skc_ulong u64; - skc_uint2 u32v2; - - struct { - skc_uint id : SKC_TTCK_LO_BITS_ID; - skc_uint prefix : SKC_TTCK_LO_BITS_PREFIX; - skc_uint escape : SKC_TTCK_LO_BITS_ESCAPE; - skc_uint layer_lo : SKC_TTCK_LO_BITS_LAYER; - skc_uint layer_hi : SKC_TTCK_HI_BITS_LAYER; - skc_uint x : SKC_TTCK_HI_BITS_X; - skc_uint y : SKC_TTCK_HI_BITS_Y; - }; - - struct { - skc_ulong na0 : SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE; - skc_ulong layer : SKC_TTCK_BITS_LAYER; - skc_ulong na1 : SKC_TTCK_HI_BITS_YX; - }; - - struct { - skc_uint na2; - skc_uint na3 : SKC_TTCK_HI_BITS_LAYER; - skc_uint yx : SKC_TTCK_HI_BITS_YX; - }; -}; - -// -// FIXME -- accept floats on host but convert to subpixel offsets -// before appending to command ring -// - -#define SKC_PLACE_CMD_TX_CONVERT(f) 0 -#define SKC_PLACE_CMD_TY_CONVERT(f) 0 - -// -// COMPOSITION PLACE -// -// This is a snapshot of the host-side command queue. -// -// Note that the composition command extent could be implemented as -// either a mapped buffer or simply copied to an ephemeral extent. -// -// This implementation may vary between compute platforms. 
-// - -struct skc_composition_place -{ - struct skc_composition_impl * impl; - - cl_command_queue cq; - - struct skc_extent_phw1g_tdrNs_snap cmds; - - skc_subbuf_id_t id; -}; - -// -// Forward declarations -// - -static -void -skc_composition_unseal_block(struct skc_composition_impl * const impl, - skc_bool const block); - -// -// -// - -static -void -skc_composition_pfn_release(struct skc_composition_impl * const impl) -{ - if (--impl->composition->ref_count != 0) - return; - - // - // otherwise, dispose of all resources - // - - // the unsealed state is a safe state to dispose of resources - skc_composition_unseal_block(impl,true); // block - - struct skc_runtime * const runtime = impl->runtime; - - // free host composition - skc_runtime_host_perm_free(runtime,impl->composition); - - // release the cq - skc_runtime_release_cq_in_order(runtime,impl->cq); - - // release kernels - cl(ReleaseKernel(impl->kernels.place)); - cl(ReleaseKernel(impl->kernels.segment)); - - // release extents - skc_extent_phw1g_tdrNs_free(runtime,&impl->cmds.extent); - skc_extent_phrw_free (runtime,&impl->saved.extent); - skc_extent_phr_pdrw_free (runtime,&impl->atomics); - - skc_extent_pdrw_free (runtime,&impl->keys); - skc_extent_pdrw_free (runtime,&impl->offsets); - - // free composition impl - skc_runtime_host_perm_free(runtime,impl); -} - -// -// -// - -static -void -skc_composition_place_grid_pfn_dispose(skc_grid_t const grid) -{ - struct skc_composition_place * const place = skc_grid_get_data(grid); - struct skc_composition_impl * const impl = place->impl; - struct skc_runtime * const runtime = impl->runtime; - - // release cq - skc_runtime_release_cq_in_order(runtime,place->cq); - - // unmap the snapshot (could be a copy) - skc_extent_phw1g_tdrNs_snap_free(runtime,&place->cmds); - - // release place struct - skc_runtime_host_temp_free(runtime,place,place->id); - - // release impl - skc_composition_pfn_release(impl); -} - -// -// -// - -static -void -skc_composition_place_read_complete(skc_grid_t const grid) -{ - skc_grid_complete(grid); -} - -static -void -skc_composition_place_read_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_composition_place * const place = skc_grid_get_data(grid); - struct skc_composition_impl * const impl = place->impl; - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(scheduler,skc_composition_place_read_complete,grid); -} - -static -void -skc_composition_place_grid_pfn_execute(skc_grid_t const grid) -{ - // - // FILLS EXPAND - // - // need result of cmd counts before launching RASTERIZE grids - // - // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host - // - OpenCL 2.x: have a kernel size and launch RASTERIZE grids from device - // - or launch a device-wide grid that feeds itself but that's unsatisfying - // - struct skc_composition_place * const place = skc_grid_get_data(grid); - struct skc_composition_impl * const impl = place->impl; - struct skc_runtime * const runtime = impl->runtime; - - skc_uint const work_size = skc_extent_ring_snap_count(place->cmds.snap); - skc_uint4 const clip = { 0, 0, SKC_UINT_MAX, SKC_UINT_MAX }; - - // initialize kernel args - cl(SetKernelArg(impl->kernels.place,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.place,1,SKC_CL_ARG(impl->atomics.drw))); - 
cl(SetKernelArg(impl->kernels.place,2,SKC_CL_ARG(impl->keys.drw))); - cl(SetKernelArg(impl->kernels.place,3,SKC_CL_ARG(place->cmds.drN))); - cl(SetKernelArg(impl->kernels.place,4,SKC_CL_ARG(runtime->handle_pool.map.drw))); - cl(SetKernelArg(impl->kernels.place,5,SKC_CL_ARG(clip))); // FIXME -- convert the clip to yx0/yx1 format - cl(SetKernelArg(impl->kernels.place,6,SKC_CL_ARG(work_size))); - - // launch kernel - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_PLACE, - place->cq, - impl->kernels.place, - work_size, - 0,NULL,NULL); - // - // copy atomics back after every place launch - // - cl_event complete; - - skc_extent_phr_pdrw_read(&impl->atomics,place->cq,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_composition_place_read_cb,grid)); - cl(ReleaseEvent(complete)); - - // flush command queue - cl(Flush(place->cq)); -} - -// -// -// - -static -void -skc_composition_snap(struct skc_composition_impl * const impl) -{ - skc_composition_retain(impl->composition); - - skc_subbuf_id_t id; - - struct skc_composition_place * const place = skc_runtime_host_temp_alloc(impl->runtime, - SKC_MEM_FLAGS_READ_WRITE, - sizeof(*place),&id,NULL); - - // save the subbuf id - place->id = id; - - // save backpointer - place->impl = impl; - - // set grid data - skc_grid_set_data(impl->grids.place,place); - - // acquire command queue - place->cq = skc_runtime_acquire_cq_in_order(impl->runtime); - - // checkpoint the ring - skc_extent_ring_checkpoint(&impl->cmds.ring); - - // make a snapshot - skc_extent_phw1g_tdrNs_snap_init(impl->runtime,&impl->cmds.ring,&place->cmds); - - // unmap the snapshot (could be a copy) - skc_extent_phw1g_tdrNs_snap_alloc(impl->runtime, - &impl->cmds.extent, - &place->cmds, - place->cq, - NULL); - - skc_grid_force(impl->grids.place); -} - -// -// -// - -static -void -skc_composition_pfn_seal(struct skc_composition_impl * const impl) -{ - // return if sealing or sealed - if (impl->state >= SKC_COMPOSITION_STATE_SEALING) - return; - - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // - // otherwise, wait for UNSEALING > UNSEALED transition - // - if (impl->state == SKC_COMPOSITION_STATE_UNSEALING) - { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_UNSEALED); - } - else // or we were already unsealed - { - // flush is there is work in progress - skc_uint const count = skc_extent_ring_wip_count(&impl->cmds.ring); - - if (count > 0) { - skc_composition_snap(impl); - } - } - - // - // now unsealed so we need to start sealing... 
- // - impl->state = SKC_COMPOSITION_STATE_SEALING; - - // - // the seal operation implies we should force start all dependencies - // that are still in a ready state - // - skc_grid_force(impl->grids.sort); -} - -// -// -// - -void -skc_composition_sort_execute_complete(struct skc_composition_impl * const impl) -{ - // we're sealed - impl->state = SKC_COMPOSITION_STATE_SEALED; - - // this grid is done - skc_grid_complete(impl->grids.sort); -} - -static -void -skc_composition_sort_execute_cb(cl_event event, cl_int status, struct skc_composition_impl * const impl) -{ - SKC_CL_CB(status); - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(impl->runtime->scheduler,skc_composition_sort_execute_complete,impl); -} - -static -void -skc_composition_sort_grid_pfn_execute(skc_grid_t const grid) -{ - struct skc_composition_impl * const impl = skc_grid_get_data(grid); - - // we should be sealing - assert(impl->state == SKC_COMPOSITION_STATE_SEALING); - - struct skc_place_atomics * const atomics = impl->atomics.hr; - -#ifndef NDEBUG - fprintf(stderr,"composition sort: %u\n",atomics->keys); -#endif - - if (atomics->keys > 0) - { - uint32_t keys_padded_in, keys_padded_out; - - hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); - - hs_sort(impl->cq, - impl->keys.drw, - impl->keys.drw, - atomics->keys, - keys_padded_in, - keys_padded_out, - false); - - cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(impl->keys.drw))); - cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(impl->offsets.drw))); - cl(SetKernelArg(impl->kernels.segment,2,SKC_CL_ARG(impl->atomics.drw))); - - // find start of each tile - skc_device_enqueue_kernel(impl->runtime->device, - SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, - impl->cq, - impl->kernels.segment, - atomics->keys, - 0,NULL,NULL); - } - - cl_event complete; - - // next stage needs to know number of key segments - skc_extent_phr_pdrw_read(&impl->atomics,impl->cq,&complete); - - // register a callback - cl(SetEventCallback(complete,CL_COMPLETE,skc_composition_sort_execute_cb,impl)); - cl(ReleaseEvent(complete)); - - // flush cq - cl(Flush(impl->cq)); -} - -// -// -// - -static -void -skc_composition_raster_release(struct skc_composition_impl * const impl) -{ - // - // reference counts to rasters can only be released when the - // composition is unsealed and the atomics are reset. - // - skc_runtime_raster_device_release(impl->runtime, - impl->saved.extent.hrw, - impl->saved.count); - // reset count - impl->saved.count = 0; -} - -// -// -// - -static -void -skc_composition_unseal_block(struct skc_composition_impl * const impl, - skc_bool const block) -{ - // return if already unsealed - if (impl->state == SKC_COMPOSITION_STATE_UNSEALED) - return; - - // - // otherwise, we're going to need to pump the scheduler - // - struct skc_scheduler * const scheduler = impl->runtime->scheduler; - - // - // wait for UNSEALING > UNSEALED transition - // - if (impl->state == SKC_COMPOSITION_STATE_UNSEALING) - { - if (block) { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_UNSEALED); - } - return; - } - - // - // wait for SEALING > SEALED transition ... 
- // - if (impl->state == SKC_COMPOSITION_STATE_SEALING) - { - // wait if sealing - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_SEALED); - } - - // wait for rendering locks to be released - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0); - - // - // no need to visit UNSEALING state with this implementation - // - - // acquire a new grid - impl->grids.sort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - NULL, // the composition state guards this - impl, - NULL, // no waiting - skc_composition_sort_grid_pfn_execute, - NULL); // no dispose - - // mark composition as unsealed - impl->state = SKC_COMPOSITION_STATE_UNSEALED; -} - -// -// can only be called on a composition that was just unsealed -// -static -void -skc_composition_reset(struct skc_composition_impl * const impl) -{ - // zero the atomics - skc_extent_phr_pdrw_zero(&impl->atomics,impl->cq,NULL); - - // flush it - cl(Flush(impl->cq)); - - // release all the rasters - skc_composition_raster_release(impl); -} - -static -void -skc_composition_unseal_block_reset(struct skc_composition_impl * const impl, - skc_bool const block, - skc_bool const reset) -{ - skc_composition_unseal_block(impl,block); - - if (reset) { - skc_composition_reset(impl); - } -} - -// -// -// - -static -void -skc_composition_pfn_unseal(struct skc_composition_impl * const impl, skc_bool const reset) -{ - skc_composition_unseal_block_reset(impl,false,reset); -} - -// -// only needs to create a grid -// - -static -void -skc_composition_place_create(struct skc_composition_impl * const impl) -{ - // acquire a grid - impl->grids.place = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - &impl->grids.place, - NULL, - NULL, // no waiting - skc_composition_place_grid_pfn_execute, - skc_composition_place_grid_pfn_dispose); - - // assign happens-after relationship - skc_grid_happens_after_grid(impl->grids.sort,impl->grids.place); -} - - -static -skc_err -skc_composition_pfn_place(struct skc_composition_impl * const impl, - skc_raster_t const * rasters, - skc_layer_id const * layer_ids, - skc_float const * txs, - skc_float const * tys, - skc_uint count) -{ - // block and yield if not unsealed - skc_composition_unseal_block(impl,true); - - // - // validate and retain all rasters - // - skc_err err; - - err = skc_runtime_handle_device_validate_retain(impl->runtime, - SKC_TYPED_HANDLE_TYPE_IS_RASTER, - rasters, - count); - if (err) - return err; - - skc_runtime_handle_device_retain(impl->runtime,rasters,count); - - // - // save the stripped handles - // - skc_raster_t * saved = impl->saved.extent.hrw; - - saved += impl->saved.count; - impl->saved.count += count; - - for (skc_uint ii=0; iiruntime->scheduler,(rem = skc_extent_ring_wip_rem(&impl->cmds.ring)) == 0); - - // append commands - skc_uint avail = min(rem,count); - - // decrement count - count -= avail; - - // launch a place kernel after copying commands? - skc_bool const is_wip_full = (avail == rem); - - // if there is no place grid then create one - if (impl->grids.place == NULL) - { - skc_composition_place_create(impl); - } - - // - // FIXME -- OPTIMIZATION? -- the ring_wip_index_inc() test can - // be avoided by splitting into at most two intervals. It should - // be plenty fast as is though so leave for now. 
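  //
  // Editor's sketch of that two-interval split -- hedged, not part of
  // this change.  The idea is to write each chunk of commands as at
  // most two contiguous runs (one up to the wrap point, then one from
  // slot 0) and bump the wip index once, instead of calling
  // skc_extent_ring_wip_index_inc() per element.  The helpers
  // skc_extent_ring_wip_index() and skc_extent_ring_wip_index_add()
  // are assumed here and do not exist in this source; 'local' stands
  // for a temporary array holding the 'avail' commands, and the
  // per-raster happens-after registration below is unaffected:
  //
  //   skc_uint const idx  = skc_extent_ring_wip_index(&impl->cmds.ring);          // assumed accessor
  //   skc_uint const mask = impl->cmds.ring.size.mask;
  //   skc_uint const lo   = min(impl->cmds.ring.size.pow2 - (idx & mask),avail);  // run before the wrap
  //   skc_uint const hi   = avail - lo;                                           // run after the wrap (may be 0)
  //
  //   memcpy(cmds + (idx & mask),local,     lo * sizeof(*cmds));                  // needs <string.h>
  //   memcpy(cmds,               local + lo,hi * sizeof(*cmds));
  //
  //   skc_extent_ring_wip_index_add(&impl->cmds.ring,avail);                      // assumed bulk advance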
- // - union skc_cmd_place * const cmds = impl->cmds.extent.hw1; - - if ((txs == NULL) && (tys == NULL)) - { - while (avail-- > 0) - { - skc_raster_t const raster = *saved++; - - skc_grid_happens_after_handle(impl->grids.place,raster); - - cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = - (union skc_cmd_place){ raster, *layer_ids++, 0, 0 }; - } - } - else if (txs == NULL) - { - while (avail-- > 0) - { - skc_raster_t const raster = *saved++; - - skc_grid_happens_after_handle(impl->grids.place,raster); - - cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = - (union skc_cmd_place){ raster, - *layer_ids++, - 0, - SKC_PLACE_CMD_TY_CONVERT(*tys++) }; - } - } - else if (tys == NULL) - { - while (avail-- > 0) - { - skc_raster_t const raster = *saved++; - - skc_grid_happens_after_handle(impl->grids.place,raster); - - cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = - (union skc_cmd_place){ raster, - *layer_ids++, - SKC_PLACE_CMD_TX_CONVERT(*txs++), - 0 }; - } - } - else - { - while (avail-- > 0) - { - skc_raster_t const raster = *saved++; - - skc_grid_happens_after_handle(impl->grids.place,raster); - - cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = - (union skc_cmd_place){ raster, - *layer_ids++, - SKC_PLACE_CMD_TX_CONVERT(*txs++), - SKC_PLACE_CMD_TY_CONVERT(*tys++) }; - } - } - - // launch place kernel? - if (is_wip_full) { - skc_composition_snap(impl); - } - } while (count > 0); - - return SKC_ERR_SUCCESS; -} - -// -// -// - -static -void -skc_composition_pfn_bounds(struct skc_composition_impl * const impl, skc_int bounds[4]) -{ - // - // FIXME -- not implemented yet - // - // impl bounds will be copied back after sealing - // - bounds[0] = SKC_INT_MIN; - bounds[1] = SKC_INT_MIN; - bounds[2] = SKC_INT_MAX; - bounds[3] = SKC_INT_MAX; -} - -// -// -// - -void -skc_composition_retain_and_lock(struct skc_composition * const composition) -{ - skc_composition_retain(composition); - - composition->impl->lock_count += 1; -} - -void -skc_composition_unlock_and_release(struct skc_composition * const composition) -{ - composition->impl->lock_count -= 1; - - skc_composition_pfn_release(composition->impl); -} - -// -// -// - -skc_err -skc_composition_cl_12_create(struct skc_context * const context, - struct skc_composition * * const composition) -{ - struct skc_runtime * const runtime = context->runtime; - - // retain the context - // skc_context_retain(context); - - // allocate impl - struct skc_composition_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - // allocate composition - (*composition) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**composition)); - - (*composition)->context = context; - (*composition)->impl = impl; - (*composition)->ref_count = 1; - - (*composition)->place = skc_composition_pfn_place; - (*composition)->unseal = skc_composition_pfn_unseal; - (*composition)->seal = skc_composition_pfn_seal; - (*composition)->bounds = skc_composition_pfn_bounds; - (*composition)->release = skc_composition_pfn_release; - - // intialize impl - impl->composition = (*composition); - impl->runtime = runtime; - - SKC_ASSERT_STATE_INIT(impl,SKC_COMPOSITION_STATE_SEALED); - - impl->lock_count = 0; - - impl->grids.sort = NULL; - impl->grids.place = NULL; - - // acquire command queue for sealing/unsealing - impl->cq = skc_runtime_acquire_cq_in_order(runtime); - - // acquire kernels - impl->kernels.place = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_PLACE); - impl->kernels.segment = 
skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK); - - // get config - struct skc_config const * const config = runtime->config; - - // initialize ring size with config values - skc_extent_ring_init(&impl->cmds.ring, - config->composition.cmds.elem_count, - config->composition.cmds.snap_count, - sizeof(union skc_cmd_place)); - - skc_extent_phw1g_tdrNs_alloc(runtime,&impl->cmds.extent ,sizeof(union skc_cmd_place) * config->composition.cmds.elem_count); - skc_extent_phrw_alloc (runtime,&impl->saved.extent,sizeof(skc_raster_t) * config->composition.raster_ids.elem_count); - skc_extent_phr_pdrw_alloc (runtime,&impl->atomics ,sizeof(struct skc_place_atomics)); - - skc_extent_pdrw_alloc (runtime,&impl->keys ,sizeof(skc_ttxk_t) * config->composition.keys.elem_count); - skc_extent_pdrw_alloc (runtime,&impl->offsets ,sizeof(skc_uint) * (1u << SKC_TTCK_HI_BITS_YX)); // 1MB - - // nothing saved - impl->saved.count = 0; - - // unseal the composition, zero the atomics, etc. - skc_composition_unseal_block_reset(impl,false,true); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/composition_cl_12.h b/src/compute/skc/composition_cl_12.h deleted file mode 100644 index 4f52090658..0000000000 --- a/src/compute/skc/composition_cl_12.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include - -#include "composition.h" -#include "assert_state.h" -#include "grid.h" -#include "extent_cl_12.h" -#include "extent_ring.h" - -// -// composition states -// - -typedef enum skc_composition_state_e { - - SKC_COMPOSITION_STATE_UNSEALING, - SKC_COMPOSITION_STATE_UNSEALED, - SKC_COMPOSITION_STATE_SEALING, - SKC_COMPOSITION_STATE_SEALED - -} skc_composition_state_e; - -// -// IMPL -// - -struct skc_composition_impl -{ - struct skc_composition * composition; - struct skc_runtime * runtime; - - SKC_ASSERT_STATE_DECLARE(skc_composition_state_e); - - skc_int lock_count; // wip renders - - struct { - skc_grid_t sort; - skc_grid_t place; - } grids; - - cl_command_queue cq; - - struct { - cl_kernel place; - cl_kernel segment; - } kernels; - - // raster ids must be held until the composition is reset or - // released and then their refcounts can be decremented - struct { - struct skc_extent_phrw extent; - skc_uint count; - } saved; - - struct { - struct skc_extent_ring ring; // how many slots left? - struct skc_extent_phw1g_tdrNs extent; // wip command extent - } cmds; - - // composition extent length - struct skc_extent_phr_pdrw atomics; - - // composition ttck extent - struct skc_extent_pdrw keys; - - // key offsets in sealed and sorted ttck extent - struct skc_extent_pdrw offsets; -}; - -// -// ATOMICS -// - -struct skc_place_atomics -{ - skc_uint keys; - skc_uint offsets; -}; - -// -// ONLY VISIBLE WITHIN THIS RUNTIME -// - -void -skc_composition_retain_and_lock(struct skc_composition * const composition); - -void -skc_composition_unlock_and_release(struct skc_composition * const composition); - -// -// -// diff --git a/src/compute/skc/config_cl.h b/src/compute/skc/config_cl.h deleted file mode 100644 index 0172857b07..0000000000 --- a/src/compute/skc/config_cl.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include "runtime_cl.h" -#include "block_pool_cl.h" - -// -// FIXME -- define individual structs before defining skc_config -// - -struct skc_config -{ - struct { - struct { - skc_uint size; - skc_uint subbufs; - } host; // alignment determined by compiler - struct { - skc_uint size; - skc_uint subbufs; - } device; // alignment determined by device - } suballocator; - - struct { - skc_uint size; - } scheduler; - - struct { - skc_uint bytes; // bytes per subblock -- pow2 - skc_uint words; // words per subblock -- pow2 - // skc_uint words_log2; - } subblock; - - struct { - skc_uint bytes; // bytes per block -- pow2 - skc_uint words; // words per block -- pow2 - skc_uint subblocks; // subblocks per block -- block.bytes >= subblock.bytes - // skc_uint subblocks_log2; - } block; - - union skc_block_pool_size block_pool; - - struct { - skc_cq_type_e type; - skc_uint size; - } cq_pool; - - struct { - skc_uint size; // a large fraction of block pool size - skc_uint width; // determines number of launched reclamation subgroups - skc_uint recs; // how many in-flight width-subgroup reclamation grids - } handle_pool; - - struct { - skc_uint width; // tile width in pixels - skc_uint height; // tile height in pixels - skc_uint ratio; // subblocks per TTPB - } tile; - - struct { - struct { - skc_uint count; // # of subbufs in buffer - } buffer; - - struct { - skc_uint count; // # of blocks/commands in subbuf - } subbuf; - - struct { - size_t buffer; // block.bytes * subbuf.blocks * subbuf.count - size_t subbuf; // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN - } block; - - struct { - size_t buffer; // sizeof(skc_uint) * subbuf.blocks * subbuf.count - size_t subbuf; // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN - } command; - // - // skc_uint paths_lowat; - // - } paths_copy; - - struct { - struct { - skc_uint elem_count; - skc_uint snap_count; - } path_ids; - - struct { - skc_uint elem_count; - skc_uint snap_count; - } transforms; - - struct { - skc_uint elem_count; - skc_uint snap_count; - } clips; - - struct { - skc_uint elem_count; - skc_uint snap_count; - } fill; - - struct { - skc_uint elem_count; - skc_uint snap_count; - } raster_ids; - - struct { - skc_uint cmds; - } expand; - - struct { - skc_uint keys; - } rasterize; - } raster_cohort; - - struct { - struct { - skc_uint elem_count; - skc_uint snap_count; - } cmds; - - struct { - skc_uint elem_count; - } raster_ids; - - struct { - skc_uint elem_count; - } keys; - } composition; -}; - -// -// -// diff --git a/src/compute/skc/cq_pool_cl.c b/src/compute/skc/cq_pool_cl.c deleted file mode 100644 index 80cfe34cf8..0000000000 --- a/src/compute/skc/cq_pool_cl.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#ifndef NDEBUG -#include -#endif - -// -// -// - -#include - -// -// -// - -#include "runtime_cl_12.h" - -// -// This implementation is probably excessive. -// -// The command queue pool could easily be replaced with simply an LRU -// or even round-robin reuse pool. Even a small number of aliased -// command queues can probably enough concurrency. 
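//
// For contrast, the round-robin alternative mentioned above can be
// very small.  The sketch below is an editor's illustration only --
// the skc_cq_rr_* names are hypothetical and not part of SKC -- and
// it assumes a fixed set of queues created once at startup.  Since
// the queues are aliased and handed out in rotation, "release"
// becomes a no-op.
//

#define SKC_CQ_RR_POOL_SIZE 8

struct skc_cq_rr_pool
{
  cl_command_queue cq[SKC_CQ_RR_POOL_SIZE]; // created once at startup
  skc_uint         next;                    // monotonically increasing cursor
};

static
cl_command_queue
skc_cq_rr_pool_acquire(struct skc_cq_rr_pool * const pool)
{
  // hand out queues in rotation -- callers may share a queue
  return pool->cq[pool->next++ % SKC_CQ_RR_POOL_SIZE];
}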
-// - -#define SKC_CQ_POOL_EXPAND 1 - -// -// -// - -void -skc_cq_pool_create(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool, - skc_uint const type, - skc_uint const size) -{ - pool->type = type; - pool->size = size + 1; // an empty spot - pool->reads = 0; - pool->writes = size; - pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq)); - - for (skc_uint ii=0; iicq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type); - } - pool->cq[size] = NULL; -} - -// -// -// - -void -skc_cq_pool_dispose(struct skc_runtime * const runtime, - struct skc_cq_pool * pool) -{ - // - // FIXME -- release the command queues after waiting for the ring to - // be full with pool.size queues? - // - skc_runtime_host_perm_free(runtime,pool->cq); -} - -// -// -// - -static -void -skc_cq_pool_write(struct skc_cq_pool * const pool, - cl_command_queue cq) -{ - pool->cq[pool->writes++ % pool->size] = cq; -} - -// -// only expand when completely empty -// - -static -void -skc_cq_pool_expand(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool, - skc_uint expand) -{ -#ifndef NDEBUG - fprintf(stderr,"Expanding the cq_pool by: %u (%u)\n",expand,pool->size); -#endif - - // free old - skc_runtime_host_perm_free(runtime,pool->cq); - - // the ring is empty - pool->size += expand; - pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq)); - pool->reads = 0; - pool->writes = expand; - - for (skc_uint ii=0; iicq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type); -} - -// -// -// - -static -cl_command_queue -skc_cq_pool_read(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool) -{ - // any command queues left? - if (pool->reads == pool->writes) - skc_cq_pool_expand(runtime,pool,SKC_CQ_POOL_EXPAND); - - cl_command_queue cq = pool->cq[pool->reads++ % pool->size]; - - return cq; -} - -// -// -// - -cl_command_queue -skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime) -{ - return skc_cq_pool_read(runtime,&runtime->cq_pool); -} - -void -skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, - cl_command_queue cq) -{ - skc_cq_pool_write(&runtime->cq_pool,cq); -} - -// -// -// diff --git a/src/compute/skc/cq_pool_cl.h b/src/compute/skc/cq_pool_cl.h deleted file mode 100644 index 0cc73a2f82..0000000000 --- a/src/compute/skc/cq_pool_cl.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -#include "types.h" - -// -// Why we need to wrap command queue creation: -// -// - command queue creation is expensive -// -// - the CL 1.2 function is deprecated in 2.0 -// - -struct skc_cq_pool -{ - skc_cq_type_e type; - skc_uint size; - skc_uint reads; - skc_uint writes; - cl_command_queue * cq; -}; - -//l -// -// - -void -skc_cq_pool_create(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool, - skc_uint const type, - skc_uint const size); - -void -skc_cq_pool_dispose(struct skc_runtime * const runtime, - struct skc_cq_pool * pool); - -// -// -// diff --git a/src/compute/skc/device_cl_12.h b/src/compute/skc/device_cl_12.h deleted file mode 100644 index 637b61ae10..0000000000 --- a/src/compute/skc/device_cl_12.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include - -// -// -// - -#define SKC_CL_ARG(arg) sizeof(arg),&arg - -// -// -// - -typedef enum skc_device_kernel_id { - SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS, - SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS, - - SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, - SKC_DEVICE_KERNEL_ID_PATHS_COPY, - - SKC_DEVICE_KERNEL_ID_FILLS_EXPAND, - - SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL, - SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES, - SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS, - SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS, - SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_QUADS, - SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_CUBICS, - - SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK, - SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC, - - SKC_DEVICE_KERNEL_ID_PREFIX, - SKC_DEVICE_KERNEL_ID_PLACE, - SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, - - SKC_DEVICE_KERNEL_ID_RENDER, - - SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM, - SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM, - - // - SKC_DEVICE_KERNEL_ID_COUNT - -} skc_device_kernel_id; - -// -// -// - -void -skc_device_create(struct skc_runtime * const runtime); - - -void -skc_device_dispose(struct skc_runtime * const runtime); - - -// -// multi-threading/context/device requires multiple kernel instances -// - -cl_kernel -skc_device_acquire_kernel(struct skc_device * const device, - skc_device_kernel_id const type); - -// -// grid shape can vary greatly by target platform -// -void -skc_device_enqueue_kernel(struct skc_device * const device, - skc_device_kernel_id const type, - cl_command_queue cq, - cl_kernel kernel, - size_t const work_size, - cl_uint num_events_in_wait_list, - cl_event const * const event_wait_list, - cl_event * const event); - -// -// -// diff --git a/src/compute/skc/device_cl_12_avx2.h b/src/compute/skc/device_cl_12_avx2.h deleted file mode 100644 index e68579c0f7..0000000000 --- a/src/compute/skc/device_cl_12_avx2.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_ONCE_DEVICE_CL_12_AVX2_H -#define SKC_ONCE_DEVICE_CL_12_AVX2_H - -// -// -// - -#define SKC_DEVICE_BLOCK_WORDS_LOG2 6 -#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 4 - -// -// -// - -#define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) -#define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -// -// -// - -#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) - -// -// -// - -#define SKC_COPY_PATHS_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS -#define SKC_COPY_PATHS_ELEM_WORDS 1 - -// -// -// - -#define SKC_EXPAND_FILLS_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS -#define SKC_EXPAND_FILLS_ELEM_WORDS 1 - -// -// -// - -#define SKC_RASTERIZE_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/device_cl_12_gen9.c b/src/compute/skc/device_cl_12_gen9.c deleted file mode 100644 index 5b4d9d2dd2..0000000000 --- a/src/compute/skc/device_cl_12_gen9.c +++ /dev/null @@ -1,942 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#include -#include -#include - -#include "common/cl/assert_cl.h" -#include "macros.h" - -#include "config_cl.h" -#include "runtime_cl_12.h" - -#include "raster.h" -#include "tile.h" - -#include "hs/cl/hs_cl_launcher.h" -#include "hs/cl/gen9/hs_cl.h" - -// -// -// - -#define SKC_KERNEL_SPIRV 0 -#define SKC_KERNEL_BINARY 1 -#define SKC_KERNEL_SRC 0 - -// -// -// - -#if SKC_KERNEL_SPIRV - -#include "block_pool_init.pre.spv.inl" -#include "paths_copy.pre.spv.inl" -#include "fills_expand.pre.spv.inl" -#include "rasterize.pre.spv.inl" -#include "segment_ttrk.pre.spv.inl" -#include "rasters_alloc.pre.spv.inl" -#include "prefix.pre.spv.inl" -#include "place.pre.spv.inl" -#include "segment_ttck.pre.spv.inl" -#include "render.pre.spv.inl" -#include "paths_reclaim.pre.spv.inl" -#include "rasters_reclaim.pre.spv.inl" - -#elif SKC_KERNEL_BINARY - -#include "block_pool_init.pre.bin.inl" -#include "paths_copy.pre.bin.inl" -#include "fills_expand.pre.bin.inl" -#include "rasterize.pre.bin.inl" -#include "segment_ttrk.pre.bin.inl" -#include "rasters_alloc.pre.bin.inl" -#include "prefix.pre.bin.inl" -#include "place.pre.bin.inl" -#include "segment_ttck.pre.bin.inl" -#include "render.pre.bin.inl" -#include "paths_reclaim.pre.bin.inl" -#include "rasters_reclaim.pre.bin.inl" - -#elif SKC_KERNEL_SRC - -#include "block_pool_init.pre.src.inl" -#include "paths_copy.pre.src.inl" -#include "fills_expand.pre.src.inl" -#include "rasterize.pre.src.inl" -#include "segment_ttrk.pre.src.inl" -#include "rasters_alloc.pre.src.inl" -#include "prefix.pre.src.inl" -#include "place.pre.src.inl" -#include "segment_ttck.pre.src.inl" -#include "render.pre.src.inl" -#include "paths_reclaim.pre.src.inl" -#include "rasters_reclaim.pre.src.inl" - -#endif - -// -// -// - -#include "device_cl_12_gen9.h" - -// -// FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY -// - -static -struct skc_config const config = - { - .suballocator = { - .host = { - .size = 1024 * 1024, // words - .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) - }, - .device = { - .size = 128 * 1024 * 1024, - .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) - } - }, - - .scheduler = { - .size = 4096 // 128 // fixme -- this is just for testing -- too big - }, - - .subblock = { - .words = SKC_DEVICE_SUBBLOCK_WORDS, // words per subblock -- pow2 - .bytes = SKC_DEVICE_SUBBLOCK_WORDS * sizeof(skc_uint) // bytes per subblock -- pow2 - }, - - .block = { - .words = SKC_DEVICE_BLOCK_WORDS, // words per block -- pow2 - .bytes = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint), // bytes per block -- pow2 - .subblocks = SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS // subblocks per block -- block.bytes >= subblock.bytes - }, - - .block_pool = { - .pool_size = 524288, // blocks in pool -- 128 MB - .ring_pow2 = 524288, // blocks in pool rounded up pow2 - .ring_mask = 524288 - 1 - }, - - .cq_pool = { -#ifndef NDEBUG - .type = SKC_CQ_TYPE_IN_ORDER_PROFILING, -#else - .type = 0, -#endif - .size = 8 - }, - - .handle_pool = { - .size = 262144, // large fraction of block pool size (for now, 1:2) - .width = SKC_RECLAIM_ARRAY_SIZE, - .recs = 256 // too many? too few? 
- }, - - .tile = { - .width = SKC_TILE_WIDTH, // tile width in pixels - .height = SKC_TILE_HEIGHT, // tile height in pixels - .ratio = SKC_TILE_HEIGHT / SKC_TILE_WIDTH // subblocks per TTPB - }, - - .paths_copy = { - - .buffer = { - .count = 16 // # of subbufs in buffer - }, - - .subbuf = { - .count = 1024 // # of blocks/commands in subbuf - }, - - .block = { - .subbuf = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024, // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN - .buffer = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024 * 16 // block.bytes * subbuf.blocks * subbuf.count - }, - - .command = { - .subbuf = sizeof(skc_uint) * 1024, // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN - .buffer = sizeof(skc_uint) * 1024 * 16 // sizeof(skc_uint) * subbuf.blocks * subbuf.count - }, - - // skc_uint paths_lowat; - }, - - .raster_cohort = { - .path_ids = { - .elem_count = 8192, - .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER - }, - - .transforms = { - .elem_count = 8192, - .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER - }, - - .clips = { - .elem_count = 8192, - .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER - }, - - .fill = { - .elem_count = 8192, - .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER - }, - - .raster_ids = { - .elem_count = 8192, - .snap_count = (1<device->kernels[id] = clCreateKernel(program,name,&cl_err); cl_ok(cl_err); - - // - // release program now - // - // FIXME -- if/when we multithread then we need to clone kernels - // (>=2.1) or keep programs around (<=2.0) - // - - // get workgroup size - cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], - runtime->cl.device_id, - CL_KERNEL_COMPILE_WORK_GROUP_SIZE, - sizeof(runtime->device->reqd_szs[0]), - runtime->device->reqd_szs[id], - NULL)); - - // - // GEN9+ PROBING - // -#define SKC_TARGET_GEN9 -#ifdef SKC_TARGET_GEN9 - -#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 -#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 -#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A - - cl_ulong spill_mem_size; - - cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], - runtime->cl.device_id, - CL_KERNEL_SPILL_MEM_SIZE_INTEL, - sizeof(spill_mem_size), - &spill_mem_size, - NULL)); - - fprintf(stderr,"\t\tspill mem size: %lu bytes\n", - (unsigned long)spill_mem_size); - - cl_ulong local_mem_size; - - cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], - runtime->cl.device_id, - CL_KERNEL_LOCAL_MEM_SIZE, - sizeof(local_mem_size), - &local_mem_size, - NULL)); - - fprintf(stderr,"\t\tlocal mem size: %lu bytes\n", - (unsigned long)local_mem_size); -#endif - } -} - -static -void -skc_device_build_program(struct skc_runtime * const runtime, - struct skc_program_source const * const source, - struct skc_program_kernel const * const kernels, - skc_uint const kernel_count) -{ - cl_program program; - - fprintf(stderr,"%-20s: ",source->name); - - cl_int cl_err; - -#if SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V - - fprintf(stderr,"Creating (SPIR-V) ... "); - - program = clCreateProgramWithIL(runtime->cl.context, - source->src, - source->srclen, - &cl_err); - -#elif SKC_KERNEL_BINARY // PROGRAM IS BINARY - - fprintf(stderr,"Creating (Binary) ... "); - - cl_int status; - program = clCreateProgramWithBinary(runtime->cl.context, - 1, - &runtime->cl.device_id, - &source->srclen, - (unsigned char const *[]){ source->src }, - &status, - &cl_err); - -#elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE - - fprintf(stderr,"Creating (Source) ... 
"); - - program = clCreateProgramWithSource(runtime->cl.context, - 1, - (char const *[]){ source->src }, - &source->srclen, - &cl_err); -#else - -#error "SKC_KERNEL_???" - -#endif - - cl_ok(cl_err); - - fprintf(stderr,"Building ... "); - - // build the program - cl(BuildProgram(program, - 1, - &runtime->cl.device_id, - source->options, // build options are ignored by binary - NULL, - NULL)); - - fprintf(stderr,"Done\n"); - - // build the kernels - skc_device_create_kernels(runtime,kernels,kernel_count,program); - - // we're done with program for now - // can always recover it from a kernel instance - cl(ReleaseProgram(program)); -} - -// -// RELEASE KERNELS -// - -static -void -skc_device_release_kernels(struct skc_device * const device) -{ - for (skc_int ii=0; iikernels); ii++) - cl(ReleaseKernel(device->kernels[ii])); -} - - - -cl_kernel -skc_device_acquire_kernel(struct skc_device * const device, - skc_device_kernel_id const type) -{ - cl_kernel kernel = device->kernels[type]; - - cl(RetainKernel(kernel)); - - return kernel; -} - -// -// INITIALIZE KERNEL ARGS -// -// FIXME -// -// pre-assign any kernel arguments that are never going to change -- -// for example, the block pool -// - -// -// -// - -#define SKC_DEVICE_BUILD_PROGRAM(p) \ - skc_device_build_program(runtime,&program_sources.p,program_kernels.p,SKC_COUNT_OF(program_kernels.p)) - - -void -skc_device_create(struct skc_runtime * const runtime) -{ - struct skc_device * const device = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*device)); - - // hang device off of runtime - runtime->device = device; - - // hang config off of runtime - runtime->config = &config; - - // create kernels - SKC_DEVICE_BUILD_PROGRAM(block_pool_init); - SKC_DEVICE_BUILD_PROGRAM(paths_copy); - SKC_DEVICE_BUILD_PROGRAM(fills_expand); - SKC_DEVICE_BUILD_PROGRAM(rasterize); - SKC_DEVICE_BUILD_PROGRAM(segment_ttrk); - SKC_DEVICE_BUILD_PROGRAM(rasters_alloc); - SKC_DEVICE_BUILD_PROGRAM(prefix); - SKC_DEVICE_BUILD_PROGRAM(place); - SKC_DEVICE_BUILD_PROGRAM(segment_ttck); - SKC_DEVICE_BUILD_PROGRAM(render); - SKC_DEVICE_BUILD_PROGRAM(paths_reclaim); - SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim); - - // create HotSort instance -- FIXME -- how this occurs needs to be cleaned up - hs_create(runtime->cl.context,runtime->cl.device_id,NULL); -} - -void -skc_device_dispose(struct skc_runtime * const runtime) -{ - // - // FIXME -- dispose of programs, kernels, etc. - // - - skc_runtime_host_perm_free(runtime,runtime->device); -} - -// -// FIXME -- just pass the device type -// - -void -skc_device_enqueue_kernel(struct skc_device * const device, - skc_device_kernel_id const type, - cl_command_queue cq, - cl_kernel kernel, - size_t const work_size, - cl_uint num_events_in_wait_list, - cl_event const * const event_wait_list, - cl_event * const event) -{ - if (work_size == 0) - return; - - cl_uint work_dim [1]; - size_t work_global[3]; - size_t work_local [3]; - - size_t * work_local_ptr = program_kernels.kernels[type].shaper(work_size, - work_dim, - work_global, - work_local); - cl(EnqueueNDRangeKernel(cq, - kernel,// device->kernels[type], - work_dim[0], - NULL, - work_global, - work_local_ptr, - num_events_in_wait_list, - event_wait_list, - event)); -} - -// -// -// diff --git a/src/compute/skc/device_cl_12_gen9.h b/src/compute/skc/device_cl_12_gen9.h deleted file mode 100644 index dd69a845c2..0000000000 --- a/src/compute/skc/device_cl_12_gen9.h +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright 2017 Google Inc. 
- * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_ONCE_DEVICE_CL_12_GEN9_H -#define SKC_ONCE_DEVICE_CL_12_GEN9_H - -// -// FIXME -- THERE ARE SOME DUPLICATED TYPEDEFS IN THIS FILE -// -// THESE WILL GO AWAY AS THE TYPING GET POLISHED AND SIMPLIFIED -// - -#include "block.h" - -// -// HOW TO SELECT A SUBBLOCK AND BLOCK SIZES: -// -// 1) The subblock size should match the natural SIMT/SIMD width of -// the target device. -// -// 2) Either a square or rectangular (1:2) tile size is chosen. The -// tile size is usually determined by the amount of SMEM available -// to a render kernel subgroup and desired multiprocessor -// occupancy. -// -// 3) If the tile is rectangular then the block size must be at least -// twice the size of the subblock size. -// -// 4) A large block size can decrease allocation overhead but there -// will be diminishing returns as the block size increases. -// - -#define SKC_DEVICE_BLOCK_WORDS_LOG2 6 // CHANGE "WORDS" TO "SIZE" ? -#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 3 - -#define SKC_TILE_WIDTH_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 -#define SKC_TILE_HEIGHT_LOG2 (SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + 1) - -///////////////////////////////////////////////////////////////// -// -// BLOCK POOL INIT -// - -#define SKC_BP_INIT_IDS_KERNEL_ATTRIBS -#define SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(2,1,1))) - -///////////////////////////////////////////////////////////////// -// -// PATHS ALLOC -// - -#define SKC_PATHS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(1,1,1))) - -///////////////////////////////////////////////////////////////// -// -// PATHS COPY -// - -#define SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? -#define SKC_PATHS_COPY_ELEM_WORDS 1 -#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_1() - -#define SKC_PATHS_COPY_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_COPY_SUBGROUP_SIZE))) - -#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + get_sub_group_local_id() >= SKC_PATH_HEAD_WORDS) - -typedef skc_uint skc_paths_copy_elem; -typedef skc_uint skc_pb_idx_v; - -///////////////////////////////////////////////////////////////// -// -// FILLS EXPAND -// - -#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 -#define SKC_FILLS_EXPAND_ELEM_WORDS 1 - -#define SKC_FILLS_EXPAND_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_FILLS_EXPAND_SUBGROUP_SIZE))) - -///////////////////////////////////////////////////////////////// -// -// RASTER ALLOC -// -// NOTE -- Intel subgroup shuffles aren't supported in SIMD32 which is -// why use of the subgroup broadcast produces a compiler error. So a -// subgroup of size 16 is this widest we can require. 
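//
// Editor's aside: plugging the constants at the top of this header
// into the "HOW TO SELECT A SUBBLOCK AND BLOCK SIZES" guidance gives
// the following GEN9 numbers (all derivable from the macros, shown
// here only as a sanity check):
//
//   subblock = 1 << 3 =  8 words   -- matches the SIMD8 subgroup width used by several kernels here
//   block    = 1 << 6 = 64 words   -- 8 subblocks per block
//   tile     = 8 x 16 pixels       -- rectangular 1:2, so rule (3) needs block >= 2 x subblock (64 >= 16, ok)
//   ratio    = 16 / 8 = 2          -- subblocks per TTPB
//
#if 0 // restated as compile-time checks, assuming SKC_STATIC_ASSERT from "macros.h"
SKC_STATIC_ASSERT((1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) ==  8);
SKC_STATIC_ASSERT((1u << SKC_DEVICE_BLOCK_WORDS_LOG2)    == 64);
SKC_STATIC_ASSERT((1u << SKC_TILE_WIDTH_LOG2)            ==  8);
SKC_STATIC_ASSERT((1u << SKC_TILE_HEIGHT_LOG2)           == 16);
#endif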
-// - -#define SKC_RASTERS_ALLOC_GROUP_SIZE 16 - -#if (SKC_RASTERS_ALLOC_GROUP_SIZE <= 16) - -#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE))) -#define SKC_RASTERS_ALLOC_LOCAL_ID() get_sub_group_local_id() -#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) sub_group_scan_inclusive_add(v) -#define SKC_RASTERS_ALLOC_BROADCAST(v,i) sub_group_broadcast(v,i) - -#else - -#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE,1,1))) -#define SKC_RASTERS_ALLOC_LOCAL_ID() get_local_id(0) -#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) work_group_scan_inclusive_add(v) -#define SKC_RASTERS_ALLOC_BROADCAST(v,i) work_group_broadcast(v,i) - -#endif - -///////////////////////////////////////////////////////////////// -// -// RASTERIZE -// - -#define SKC_RASTERIZE_SUBGROUP_SIZE SKC_DEVICE_SUBBLOCK_WORDS -#define SKC_RASTERIZE_VECTOR_SIZE_LOG2 0 -#define SKC_RASTERIZE_WORKGROUP_SUBGROUPS 1 - -#define SKC_RASTERIZE_KERNEL_ATTRIBS \ - __attribute__((intel_reqd_sub_group_size(SKC_RASTERIZE_SUBGROUP_SIZE))) \ - __attribute__((reqd_work_group_size(SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_WORKGROUP_SUBGROUPS, 1, 1))) - -#define SKC_RASTERIZE_FLOAT float -#define SKC_RASTERIZE_UINT uint -#define SKC_RASTERIZE_INT int -#define SKC_RASTERIZE_PREDICATE bool -#define SKC_RASTERIZE_POOL uint - -#define SKC_RASTERIZE_TILE_HASH_X_BITS 1 -#define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 - -typedef skc_block_id_t skc_block_id_v_t; -typedef skc_uint2 skc_ttsk_v_t; -typedef skc_uint2 skc_ttsk_s_t; - -// SKC_STATIC_ASSERT(SKC_RASTERIZE_POOL_SIZE > SKC_RASTERIZE_SUBGROUP_SIZE); - -///////////////////////////////////////////////////////////////// -// -// PREFIX -// - -#define SKC_PREFIX_SUBGROUP_SIZE 8 // for now this had better be SKC_DEVICE_SUBBLOCK_WORDS -#define SKC_PREFIX_WORKGROUP_SUBGROUPS 1 - -#define SKC_PREFIX_KERNEL_ATTRIBS \ - __attribute__((intel_reqd_sub_group_size(SKC_PREFIX_SUBGROUP_SIZE))) \ - __attribute__((reqd_work_group_size(SKC_PREFIX_SUBGROUP_SIZE * SKC_PREFIX_WORKGROUP_SUBGROUPS, 1, 1))) - -#define SKC_PREFIX_TTP_V skc_uint2 -#define SKC_PREFIX_TTS_V_BITFIELD skc_int - -#define SKC_PREFIX_TTS_VECTOR_INT_EXPAND SKC_EXPAND_1 - -#define SKC_PREFIX_SMEM_ZERO ulong -#define SKC_PREFIX_SMEM_ZERO_WIDTH (sizeof(SKC_PREFIX_SMEM_ZERO) / sizeof(skc_ttp_t)) -#define SKC_PREFIX_SMEM_COUNT_BLOCK_ID 8 - -#define SKC_PREFIX_BLOCK_ID_V_SIZE SKC_PREFIX_SUBGROUP_SIZE - -#define SKC_PREFIX_TTXK_V_SIZE SKC_PREFIX_SUBGROUP_SIZE -#define SKC_PREFIX_TTXK_V_MASK (SKC_PREFIX_TTXK_V_SIZE - 1) - -typedef skc_uint skc_bp_elem_t; - -typedef skc_uint2 skc_ttrk_e_t; -typedef skc_uint2 skc_ttsk_v_t; -typedef skc_uint2 skc_ttsk_s_t; -typedef skc_uint2 skc_ttpk_s_t; -typedef skc_uint2 skc_ttxk_v_t; - -typedef skc_int skc_tts_v_t; - -typedef skc_int skc_ttp_t; - -typedef skc_uint skc_raster_yx_s; - -typedef skc_block_id_t skc_block_id_v_t; -typedef skc_block_id_t skc_block_id_s_t; - -///////////////////////////////////////////////////////////////// -// -// PLACE -// - -#define SKC_PLACE_SUBGROUP_SIZE 16 -#define SKC_PLACE_WORKGROUP_SUBGROUPS 1 - -#define SKC_PLACE_KERNEL_ATTRIBS \ - __attribute__((intel_reqd_sub_group_size(SKC_PLACE_SUBGROUP_SIZE))) \ - __attribute__((reqd_work_group_size(SKC_PLACE_SUBGROUP_SIZE * SKC_PLACE_WORKGROUP_SUBGROUPS, 1, 1))) - -typedef skc_uint skc_bp_elem_t; - -typedef skc_uint skc_ttsk_lo_t; -typedef skc_uint skc_ttsk_hi_t; - -typedef skc_uint skc_ttpk_lo_t; -typedef skc_uint skc_ttpk_hi_t; - -typedef skc_uint 
skc_ttxk_lo_t; -typedef skc_uint skc_ttxk_hi_t; - -typedef skc_uint2 skc_ttck_t; - -typedef skc_bool skc_pred_v_t; -typedef skc_int skc_int_v_t; - -///////////////////////////////////////////////////////////////// -// -// RENDER -// - -#define SKC_ARCH_GEN9 - -#if defined(__OPENCL_C_VERSION__) -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif - -#define SKC_RENDER_SUBGROUP_SIZE 8 -#define SKC_RENDER_WORKGROUP_SUBGROUPS 1 - -#define SKC_RENDER_KERNEL_ATTRIBS \ - __attribute__((intel_reqd_sub_group_size(SKC_RENDER_SUBGROUP_SIZE))) \ - __attribute__((reqd_work_group_size(SKC_RENDER_SUBGROUP_SIZE * SKC_RENDER_WORKGROUP_SUBGROUPS, 1, 1))) - -#define SKC_RENDER_SCANLINE_VECTOR_SIZE 2 - -#define SKC_RENDER_REGS_COLOR_R 2 -#define SKC_RENDER_REGS_COVER_R 3 - -#define SKC_RENDER_TTSB_EXPAND() SKC_EXPAND_1() - -#define SKC_RENDER_TTS_V skc_int -#define SKC_RENDER_TTS_V_BITFIELD skc_int - -#define SKC_RENDER_TTP_V skc_int2 -#define SKC_RENDER_AREA_V skc_int2 - -#define SKC_RENDER_TILE_COLOR_PAIR half2 -#define SKC_RENDER_TILE_COLOR_PAIR_LOAD(x,v) vload2(x,v) - -#define SKC_RENDER_SURFACE_COLOR half4 -#define SKC_RENDER_SURFACE_WRITE write_imageh - -// #define SKC_RENDER_TTXB_VECTOR_INT int2 -// #define SKC_RENDER_TTXB_VECTOR_UINT uint2 - -#define SKC_RENDER_WIDE_AA ulong // SLM = 64 bytes/clock - -#define SKC_RENDER_TILE_COLOR half2 -#define SKC_RENDER_TILE_COVER half2 - -#define SKC_RENDER_ACC_COVER_INT int2 -#define SKC_RENDER_ACC_COVER_UINT uint2 - -#define SKC_RENDER_GRADIENT_FLOAT float2 -#define SKC_RENDER_GRADIENT_INT int2 -#define SKC_RENDER_GRADIENT_STOP int2 -#define SKC_RENDER_GRADIENT_FRAC half2 -#define SKC_RENDER_GRADIENT_COLOR_STOP half - -#define SKC_RENDER_SURFACE_U8_RGBA uint2 - -#define SKC_RENDER_TILE_COLOR_VECTOR uint16 -#define SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT uint -#define SKC_RENDER_TILE_COLOR_VECTOR_COUNT ((sizeof(SKC_RENDER_TILE_COLOR) * 4 * SKC_TILE_WIDTH) / sizeof(SKC_RENDER_TILE_COLOR_VECTOR)) - -///////////////////////////////////////////////////////////////// -// -// PATHS & RASTERS RECLAIM -// -// FIXME -- investigate enabling the stride option for a smaller grid -// that iterates over a fixed number of threads. Since reclamation is -// a low-priority task, it's probably reasonable to trade longer -// reclamation times for lower occupancy of the device because it -// might delay the fastpath of the pipeline. -// - -#define SKC_RECLAIM_ARRAY_SIZE (7 * 8 / 2) // 8 EUs with 7 hardware threads divided by 2 is half a sub-slice - -///////////////////////////////////////////////////////////////// -// -// PATHS RECLAIM -// - -#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? -#define SKC_PATHS_RECLAIM_LOCAL_ELEMS 1 -#define SKC_PATHS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_RECLAIM_SUBGROUP_SIZE))) - -///////////////////////////////////////////////////////////////// -// -// RASTERS RECLAIM -// - -#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? 
-#define SKC_RASTERS_RECLAIM_LOCAL_ELEMS 1 -#define SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_RECLAIM_SUBGROUP_SIZE))) - -// -// COMMON -- FIXME -- HOIST THESE ELSEWHERE -// - -#define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) -#define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -#define SKC_DEVICE_BLOCK_DWORDS (SKC_DEVICE_BLOCK_WORDS / 2) - -#define SKC_DEVICE_BLOCK_WORDS_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2) -#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2 - SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) - -#define SKC_TILE_RATIO (SKC_TILE_HEIGHT / SKC_TILE_WIDTH) - -// -// -// - -#define SKC_PATHS_COPY_SUBGROUP_SIZE (1 << SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2) -#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE (1 << SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2) -#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE (1 << SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2) -#define SKC_FILLS_EXPAND_SUBGROUP_SIZE (1 << SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2) - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/export_cl_12.h b/src/compute/skc/export_cl_12.h deleted file mode 100644 index e577282791..0000000000 --- a/src/compute/skc/export_cl_12.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include "skc.h" - -// -// -// - -skc_err -skc_path_builder_cl_12_create(struct skc_context * const context, - struct skc_path_builder * * const path_builder); - -// -// -// - -skc_err -skc_raster_builder_cl_12_create(struct skc_context * const context, - struct skc_raster_builder * * const raster_builder); - -// -// -// - -skc_err -skc_composition_cl_12_create(struct skc_context * const context, - struct skc_composition * * const composition); - -// -// -// - -skc_err -skc_styling_cl_12_create(struct skc_context * const context, - struct skc_styling * * const styling, - uint32_t const layers_count, - uint32_t const groups_count, - uint32_t const extras_count); - -// -// -// - -skc_err -skc_surface_cl_12_create(struct skc_context * const context, - struct skc_surface * * const surface); - -// -// -// - diff --git a/src/compute/skc/extent_cl_12.c b/src/compute/skc/extent_cl_12.c deleted file mode 100644 index 73676d8063..0000000000 --- a/src/compute/skc/extent_cl_12.c +++ /dev/null @@ -1,459 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include - -#include "common/cl/assert_cl.h" -#include "extent_cl_12.h" -#include "runtime_cl_12.h" - -// -// DURABLE R/W HOST EXTENT -- STANDARD CACHED MEMORY -// - -void -skc_extent_phrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrw * const extent, - size_t const size) -{ - extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); -} - -void -skc_extent_phrw_free(struct skc_runtime * const runtime, - struct skc_extent_phrw * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hrw); -} - -// -// DURABLE R/W DEVICE EXTENT -- ALLOCATED FROM DEVICE HEAP -// - -void -skc_extent_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_pdrw * const extent, - size_t const size) -{ - extent->drw = skc_runtime_device_perm_alloc(runtime, - CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, - size); -} - -void -skc_extent_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_pdrw * const extent) -{ - skc_runtime_device_perm_free(runtime,extent->drw); -} - -// -// EPHEMERAL DEVICE R/W EXTENT -- ALLOCATED QUICKLY FROM A MANAGED RING -// - -void -skc_extent_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_tdrw * const extent, - size_t const size) -{ - extent->size = size; - extent->drw = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, - size,&extent->id,NULL); -} - -void -skc_extent_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_tdrw * const extent) -{ - skc_runtime_device_temp_free(runtime,extent->drw,extent->id); -} - -void -skc_extent_tdrw_zero(struct skc_extent_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - skc_uint const zero = 0; - - cl(EnqueueFillBuffer(cq, - extent->drw, - &zero, - sizeof(zero), - 0, - extent->size, - 0,NULL,event)); -} - -// -// DURABLE SMALL EXTENTS BACKING ATOMICS -// - -void -skc_extent_phr_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phr_pdrw * const extent, - size_t const size) -{ - extent->size = size; - extent->hr = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_ONLY,size); - extent->drw = skc_runtime_device_perm_alloc(runtime,CL_MEM_READ_WRITE,size); -} - -void -skc_extent_phr_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_phr_pdrw * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hr); - skc_runtime_device_perm_free(runtime,extent->drw); -} - -void -skc_extent_phr_pdrw_read(struct skc_extent_phr_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - cl(EnqueueReadBuffer(cq, - extent->drw, - CL_FALSE, - 0, - extent->size, - extent->hr, - 0,NULL,event)); -} - -void -skc_extent_phr_pdrw_zero(struct skc_extent_phr_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - skc_uint const zero = 0; - - cl(EnqueueFillBuffer(cq, - extent->drw, - &zero, - sizeof(zero), - 0, - extent->size, - 0,NULL,event)); -} - -// -// EPHEMERAL SMALL EXTENTS BACKING ATOMICS -// - -void -skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_thr_tdrw * const extent, - size_t const size) -{ - extent->size = size; - extent->hr = skc_runtime_host_temp_alloc(runtime, - SKC_MEM_FLAGS_READ_WRITE, - size,&extent->id.hr,NULL); - extent->drw = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_WRITE, - size, - &extent->id.drw, - NULL); -} - -void 
-skc_extent_thr_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_thr_tdrw * const extent) -{ - skc_runtime_host_temp_free(runtime,extent->hr,extent->id.hr); - skc_runtime_device_temp_free(runtime,extent->drw,extent->id.drw); -} - -void -skc_extent_thr_tdrw_read(struct skc_extent_thr_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - cl(EnqueueReadBuffer(cq, - extent->drw, - CL_FALSE, - 0, - extent->size, - extent->hr, - 0,NULL,event)); -} - -void -skc_extent_thr_tdrw_zero(struct skc_extent_thr_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - if (extent->size == 0) - return; - - skc_uint const zero = 0; - - cl(EnqueueFillBuffer(cq, - extent->drw, - &zero, - sizeof(zero), - 0, - extent->size, - 0,NULL,event)); -} - -// -// DURABLE W/1 HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT -// - -void -skc_extent_phw1g_tdrNs_alloc(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent, - size_t const size) -{ - extent->hw1 = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_WRITE_ONLY,size); -} - -void -skc_extent_phw1g_tdrNs_free(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hw1); -} - -void -skc_extent_phw1g_tdrNs_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phw1g_tdrNs_snap * const snap) -{ - snap->snap = skc_extent_ring_snap_alloc(runtime,ring); -} - -void -skc_extent_phw1g_tdrNs_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent, - struct skc_extent_phw1g_tdrNs_snap * const snap, - cl_command_queue const cq, - cl_event * const event) -{ - struct skc_extent_ring const * const ring = snap->snap->ring; - - skc_uint const count = skc_extent_ring_snap_count(snap->snap); - size_t const size = count * ring->size.elem; - - snap->drN = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, - size,&snap->id,NULL); - - if (count == 0) - return; - - // possibly two copies - skc_uint const index_lo = snap->snap->reads & ring->size.mask; - skc_uint const count_max = ring->size.pow2 - index_lo; - skc_uint const count_lo = min(count_max,count); - size_t const bytes_lo = count_lo * ring->size.elem; - - if (count > count_max) - { - skc_uint const bytes_hi = (count - count_max) * ring->size.elem; - - cl(EnqueueWriteBuffer(cq, - snap->drN, - CL_FALSE, - bytes_lo, - bytes_hi, - extent->hw1, // offset_hi = 0 - 0,NULL,NULL)); - } - - size_t const offset_lo = index_lo * ring->size.elem; - - cl(EnqueueWriteBuffer(cq, - snap->drN, - CL_FALSE, - 0, - bytes_lo, - (skc_uchar*)extent->hw1 + offset_lo, - 0,NULL,event)); - -} - -void -skc_extent_phw1g_tdrNs_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs_snap * const snap) -{ - skc_runtime_device_temp_free(runtime,snap->drN,snap->id); - skc_extent_ring_snap_free(runtime,snap->snap); -} - -// -// DURABLE R/W HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT -// - -void -skc_extent_phrwg_tdrNs_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent, - size_t const size) -{ - extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); // WRITE-ONCE -} - -void -skc_extent_phrwg_tdrNs_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hrw); -} - -void 
-skc_extent_phrwg_tdrNs_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phrwg_tdrNs_snap * const snap) -{ - snap->snap = skc_extent_ring_snap_alloc(runtime,ring); -} - -void -skc_extent_phrwg_tdrNs_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent, - struct skc_extent_phrwg_tdrNs_snap * const snap, - cl_command_queue const cq, - cl_event * const event) -{ - struct skc_extent_ring const * const ring = snap->snap->ring; - - skc_uint const count = skc_extent_ring_snap_count(snap->snap); - size_t const size = count * ring->size.elem; - - snap->drN = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, - size,&snap->id,NULL); - - if (count == 0) - return; - - // possibly two copies - skc_uint const index_lo = snap->snap->reads & ring->size.mask; - skc_uint const count_max = ring->size.pow2 - index_lo; - skc_uint const count_lo = min(count_max,count); - size_t const bytes_lo = count_lo * ring->size.elem; - - if (count > count_max) - { - skc_uint const count_hi = count - count_max; - skc_uint const bytes_hi = count_hi * ring->size.elem; - - cl(EnqueueWriteBuffer(cq, - snap->drN, - CL_FALSE, - bytes_lo, - bytes_hi, - extent->hrw, // offset_hi = 0 - 0,NULL,NULL)); - } - - size_t offset_lo = index_lo * ring->size.elem; - - cl(EnqueueWriteBuffer(cq, - snap->drN, - CL_FALSE, - 0, - bytes_lo, - (skc_uchar*)extent->hrw + offset_lo, - 0,NULL,event)); - -} - -void -skc_extent_phrwg_tdrNs_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs_snap * const snap) -{ - skc_runtime_device_temp_free(runtime,snap->drN,snap->id); - skc_extent_ring_snap_free(runtime,snap->snap); -} - -// -// DURABLE HOST R/W RING WITH AN EPHEMERAL HOST R/1 SNAPSHOT -// -// Note that because the ring and snapshot are both in host memory and -// the snapshot blocks progress until freed we can simply point the -// fake ephemeral snapshot at the ring's durable extent. 
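//
// Editor's worked example (illustrative numbers only) for the lo/hi
// split computed in skc_extent_phrwg_thr1s_snap_alloc() below:
//
//   ring->size.pow2   = 8, ring->size.mask = 7, elem size = e
//   snap->snap->reads = 6  ->  index_lo  = 6 & 7 = 6
//   snap count        = 5  ->  count_max = 8 - 6 = 2
//
//   count.lo = min(2,5) = 2    hr1.lo = hrw + 6*e   // tail of the ring
//   count.hi = 5 - 2    = 3    hr1.hi = hrw         // wrapped back to the head
//
// i.e. the snapshot is exposed as two host pointers into the durable
// extent rather than being copied anywhere.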
-// - -void -skc_extent_phrwg_thr1s_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent, - size_t const size) -{ - extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); // WRITE-ONCE -} - -void -skc_extent_phrwg_thr1s_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent) -{ - skc_runtime_host_perm_free(runtime,extent->hrw); -} - -void -skc_extent_phrwg_thr1s_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phrwg_thr1s_snap * const snap) -{ - snap->snap = skc_extent_ring_snap_alloc(runtime,ring); -} - -void -skc_extent_phrwg_thr1s_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent, - struct skc_extent_phrwg_thr1s_snap * const snap) -{ - struct skc_extent_ring const * const ring = snap->snap->ring; - - skc_uint const count = skc_extent_ring_snap_count(snap->snap); - skc_uint const index_lo = snap->snap->reads & ring->size.mask; - skc_uint const count_max = ring->size.pow2 - index_lo; - - snap->count.lo = min(count_max,count); - snap->hr1.lo = (skc_uchar*)extent->hrw + (index_lo * ring->size.elem); - - if (count > count_max) - { - snap->count.hi = count - count_max; - snap->hr1.hi = extent->hrw; - } - else - { - snap->count.hi = 0; - snap->hr1.hi = NULL; - } -} - -void -skc_extent_phrwg_thr1s_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s_snap * const snap) -{ - skc_extent_ring_snap_free(runtime,snap->snap); -} - -// -// -// diff --git a/src/compute/skc/extent_cl_12.h b/src/compute/skc/extent_cl_12.h deleted file mode 100644 index 47ba951bb3..0000000000 --- a/src/compute/skc/extent_cl_12.h +++ /dev/null @@ -1,476 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include - -#include "suballocator.h" -#include "extent_ring.h" - -// -// Legend: -// -// p : durable -// t : ephemeral -// h : host -// d : device -// r : read -// w : write -// 1 : once -- e.g. w1 is 'write-once' -// N : many -- e.g. 
rN is 'read-many' -// g : ring -// s : ring snapshot -// -// Notes: -// -// rw : for now, read-write implies read-write many -// - -// -// DURABLE R/W HOST EXTENT -- STANDARD CACHED MEMORY -// - -struct skc_extent_phrw -{ - void * hrw; -}; - -void -skc_extent_phrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrw * const extent, - size_t const size); - -void -skc_extent_phrw_free(struct skc_runtime * const runtime, - struct skc_extent_phrw * const extent); - -// -// DURABLE R/W DEVICE EXTENT -- ALLOCATED FROM DEVICE HEAP -// - -struct skc_extent_pdrw -{ - cl_mem drw; -}; - -void -skc_extent_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_pdrw * const extent, - size_t const size); - -void -skc_extent_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_pdrw * const extent); - -// -// EPHEMERAL DEVICE R/W EXTENT -- ALLOCATED QUICKLY FROM A MANAGED RING -// - -struct skc_extent_tdrw -{ - size_t size; - cl_mem drw; - skc_subbuf_id_t id; -}; - -void -skc_extent_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_tdrw * const extent, - size_t const size); - -void -skc_extent_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_tdrw * const extent); - -void -skc_extent_tdrw_zero(struct skc_extent_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -// -// DURABLE SMALL EXTENTS BACKING ATOMICS -// - -struct skc_extent_phr_pdrw -{ - size_t size; // must be multiple of words - void * hr; - cl_mem drw; -}; - -void -skc_extent_phr_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phr_pdrw * const extent, - size_t const size); - -void -skc_extent_phr_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_phr_pdrw * const extent); - -void -skc_extent_phr_pdrw_read(struct skc_extent_phr_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phr_pdrw_zero(struct skc_extent_phr_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -// -// EPHEMERAL SMALL EXTENTS BACKING ATOMICS -// - -struct skc_extent_thr_tdrw -{ - size_t size; // must be multiple of words - - void * hr; - cl_mem drw; - - struct { - skc_subbuf_id_t hr; - skc_subbuf_id_t drw; - } id; -}; - -void -skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_thr_tdrw * const extent, - size_t const size); - -void -skc_extent_thr_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_thr_tdrw * const extent); - -void -skc_extent_thr_tdrw_read(struct skc_extent_thr_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_thr_tdrw_zero(struct skc_extent_thr_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -// -// DURABLE W/1 HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT -// - -struct skc_extent_phw1g_tdrNs -{ - void * hw1; -}; - -struct skc_extent_phw1g_tdrNs_snap -{ - struct skc_extent_ring_snap * snap; - cl_mem drN; - skc_subbuf_id_t id; -}; - -void -skc_extent_phw1g_tdrNs_alloc(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent, - size_t const size); - -void -skc_extent_phw1g_tdrNs_free(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs * const extent); - -void -skc_extent_phw1g_tdrNs_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phw1g_tdrNs_snap * const snap); - -void -skc_extent_phw1g_tdrNs_snap_alloc(struct skc_runtime * const runtime, 
- struct skc_extent_phw1g_tdrNs * const extent, - struct skc_extent_phw1g_tdrNs_snap * const snap, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phw1g_tdrNs_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phw1g_tdrNs_snap * const snap); - -// -// DURABLE R/W HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT -// - -struct skc_extent_phrwg_tdrNs -{ - void * hrw; -}; - -struct skc_extent_phrwg_tdrNs_snap -{ - struct skc_extent_ring_snap * snap; - cl_mem drN; - skc_subbuf_id_t id; -}; - -void -skc_extent_phrwg_tdrNs_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent, - size_t const size); - -void -skc_extent_phrwg_tdrNs_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent); - -void -skc_extent_phrwg_tdrNs_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phrwg_tdrNs_snap * const snap); - -void -skc_extent_phrwg_tdrNs_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs * const extent, - struct skc_extent_phrwg_tdrNs_snap * const snap, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phrwg_tdrNs_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_tdrNs_snap * const snap); - -// -// DURABLE HOST R/W RING WITH AN EPHEMERAL HOST R/1 SNAPSHOT -// -// Note that because the ring and snapshot are both in host memory and -// the snapshot blocks progress until freed we can simply point the -// fake ephemeral snapshot at the ring's durable extent. -// - -struct skc_extent_phrwg_thr1s -{ - void * hrw; -}; - -struct skc_extent_phrwg_thr1s_snap -{ - struct skc_extent_ring_snap * snap; - - struct { - skc_uint lo; - skc_uint hi; - } count; - - struct { - void * lo; - void * hi; - } hr1; -}; - -void -skc_extent_phrwg_thr1s_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent, - size_t const size); - -void -skc_extent_phrwg_thr1s_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent); - -void -skc_extent_phrwg_thr1s_snap_init(struct skc_runtime * const runtime, - struct skc_extent_ring * const ring, - struct skc_extent_phrwg_thr1s_snap * const snap); - -void -skc_extent_phrwg_thr1s_snap_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s * const extent, - struct skc_extent_phrwg_thr1s_snap * const snap); - -void -skc_extent_phrwg_thr1s_snap_free(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s_snap * const snap); - -// -// EPHEMERAL MAPPING -// -// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. 
-// - -#if 0 -struct skc_extent_thrw_tdrw -{ - size_t size; - cl_mem drw; - skc_subbuf_id_t id; -}; - -void -skc_extent_thrw_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_thrw_tdrw * const extent, - size_t const size); - -void -skc_extent_thrw_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_thrw_tdrw * const extent); - -void * -skc_extent_thrw_tdrw_map_size(struct skc_extent_thrw_tdrw * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event); - -void * -skc_extent_thrw_tdrw_map(struct skc_extent_thrw_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_thrw_tdrw_unmap(struct skc_extent_thrw_tdrw * const extent, - void * const hrN, - cl_command_queue const cq, - cl_event * const event); -#endif - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. -// - -struct skc_extent_phrw_pdrw -{ - size_t size; - cl_mem drw; -}; - -void -skc_extent_phrw_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrw_pdrw * const extent, - size_t const size); - -void -skc_extent_phrw_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_phrw_pdrw * const extent); - -void * -skc_extent_phrw_pdrw_map_size(struct skc_extent_phrw_pdrw * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event); - -void * -skc_extent_phrw_pdrw_map(struct skc_extent_phrw_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phrw_pdrw_unmap(struct skc_extent_phrw_pdrw * const extent, - void * const hrN, - cl_command_queue const cq, - cl_event * const event); - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO R/O HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO W/O DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. -// - -struct skc_extent_phrN_pdwN -{ - size_t size; - cl_mem dwN; -}; - -void -skc_extent_phrN_pdwN_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrN_pdwN * const extent, - size_t const size); - -void -skc_extent_phrN_pdwN_free(struct skc_runtime * const runtime, - struct skc_extent_phrN_pdwN * const extent); - -void * -skc_extent_phrN_pdwN_map_size(struct skc_extent_phrN_pdwN * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event); - -void * -skc_extent_phrN_pdwN_map(struct skc_extent_phrN_pdwN * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phrN_pdwN_unmap(struct skc_extent_phrN_pdwN * const extent, - void * const hrN, - cl_command_queue const cq, - cl_event * const event); - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO W/O HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/O DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. 
-// - -struct skc_extent_phwN_pdrN -{ - size_t size; - cl_mem drN; -}; - -void -skc_extent_phwN_pdrN_alloc(struct skc_runtime * const runtime, - struct skc_extent_phwN_pdrN * const extent, - size_t const size); - -void -skc_extent_phwN_pdrN_free(struct skc_runtime * const runtime, - struct skc_extent_phwN_pdrN * const extent); - -void * -skc_extent_phwN_pdrN_map_size(struct skc_extent_phwN_pdrN * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event); - -void * -skc_extent_phwN_pdrN_map(struct skc_extent_phwN_pdrN * const extent, - cl_command_queue const cq, - cl_event * const event); - -void -skc_extent_phwN_pdrN_unmap(struct skc_extent_phwN_pdrN * const extent, - void * const hwm, - cl_command_queue const cq, - cl_event * const event); - -// -// -// diff --git a/src/compute/skc/extent_cl_12_unified.c b/src/compute/skc/extent_cl_12_unified.c deleted file mode 100644 index 69c669ad54..0000000000 --- a/src/compute/skc/extent_cl_12_unified.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// NOTE THAT NONE OF THESE EXTENTS CHECK FOR ZERO-SIZED ALLOCATIONS. -// THAT'S OK FOR NOW. -// - -#include - -#include "runtime_cl_12.h" -#include "extent_cl_12.h" -#include "common/cl/assert_cl.h" - -// -// EPHEMERAL MAPPING -// -// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. -// - -#if 0 - -#pragma message("struct skc_extent_thrw_tdrw will be removed once the sorter is installed.") - -void -skc_extent_thrw_tdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_thrw_tdrw * const extent, - size_t const size) -{ - extent->drw = skc_runtime_device_temp_alloc(runtime, - CL_MEM_READ_WRITE /* | CL_MEM_ALLOC_HOST_PTR */, - size,&extent->id,&extent->size); -} - -void -skc_extent_thrw_tdrw_free(struct skc_runtime * const runtime, - struct skc_extent_thrw_tdrw * const extent) -{ - skc_runtime_device_temp_free(runtime,extent->drw,extent->id); -} - -void * -skc_extent_thrw_tdrw_map_size(struct skc_extent_thrw_tdrw * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event) -{ - cl_int cl_err; - - void * hrw = clEnqueueMapBuffer(cq,extent->drw, - CL_FALSE, - CL_MAP_READ | CL_MAP_WRITE,0,size, - 0,NULL,event,&cl_err); cl_ok(cl_err); - - return hrw; -} - -void * -skc_extent_thrw_tdrw_map(struct skc_extent_thrw_tdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - return skc_extent_thrw_tdrw_map_size(extent,extent->size,cq,event); -} - -void -skc_extent_thrw_tdrw_unmap(struct skc_extent_thrw_tdrw * const extent, - void * const hrw, - cl_command_queue const cq, - cl_event * const event) -{ - cl(EnqueueUnmapMemObject(cq,extent->drw,hrw,0,NULL,event)); -} - -#endif - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. 
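Before the implementation below, a hedged caller-side sketch of the map/write/unmap round trip these durable mappings are built for; the queue and buffer are placeholders and error handling is reduced to asserts:

#include <assert.h>
#include <string.h>
#include <CL/opencl.h>

/* Illustrative usage of a CL_MEM_ALLOC_HOST_PTR buffer: map it into the
   host address space, fill it, then unmap so the device may access it. */
static void
fill_mapped_buffer(cl_command_queue cq, cl_mem buf, size_t size, void const * src)
{
  cl_int err;

  void * hrw = clEnqueueMapBuffer(cq, buf,
                                  CL_TRUE,                    /* block until mapped */
                                  CL_MAP_WRITE, 0, size,
                                  0, NULL, NULL, &err);
  assert(err == CL_SUCCESS);

  memcpy(hrw, src, size);                                     /* host-side write    */

  err = clEnqueueUnmapMemObject(cq, buf, hrw, 0, NULL, NULL); /* hand back to device */
  assert(err == CL_SUCCESS);
}

On an integrated GPU such a map is typically zero-copy, while a discrete GPU may stage through pinned memory, which is why the note above anticipates different per-device implementations.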
-// - -void -skc_extent_phrw_pdrw_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrw_pdrw * const extent, - size_t const size) -{ - cl_int cl_err; - - extent->size = size; - extent->drw = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - size,NULL,&cl_err); cl_ok(cl_err); -} - -void -skc_extent_phrw_pdrw_free(struct skc_runtime * const runtime, - struct skc_extent_phrw_pdrw * const extent) -{ - cl(ReleaseMemObject(extent->drw)); -} - -void * -skc_extent_phrw_pdrw_map_size(struct skc_extent_phrw_pdrw * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event) -{ - cl_int cl_err; - - void * hrw = clEnqueueMapBuffer(cq,extent->drw, - CL_FALSE, - CL_MAP_READ | CL_MAP_WRITE,0,size, - 0,NULL,event,&cl_err); cl_ok(cl_err); - - return hrw; -} - -void * -skc_extent_phrw_pdrw_map(struct skc_extent_phrw_pdrw * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - return skc_extent_phrw_pdrw_map_size(extent,extent->size,cq,event); -} - -void -skc_extent_phrw_pdrw_unmap(struct skc_extent_phrw_pdrw * const extent, - void * const hrw, - cl_command_queue const cq, - cl_event * const event) -{ - cl(EnqueueUnmapMemObject(cq,extent->drw,hrw,0,NULL,event)); -} - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO R/O HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO W/O DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. -// - -void -skc_extent_phrN_pdwN_alloc(struct skc_runtime * const runtime, - struct skc_extent_phrN_pdwN * const extent, - size_t const size) -{ - cl_int cl_err; - - extent->size = size; - extent->dwN = clCreateBuffer(runtime->cl.context, - CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, - size,NULL,&cl_err); cl_ok(cl_err); -} - -void -skc_extent_phrN_pdwN_free(struct skc_runtime * const runtime, - struct skc_extent_phrN_pdwN * const extent) -{ - cl(ReleaseMemObject(extent->dwN)); -} - -void * -skc_extent_phrN_pdwN_map_size(struct skc_extent_phrN_pdwN * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event) -{ - cl_int cl_err; - - void * hrN = clEnqueueMapBuffer(cq,extent->dwN, - CL_FALSE, - CL_MAP_READ,0,size, - 0,NULL,event,&cl_err); cl_ok(cl_err); - - return hrN; -} - -void * -skc_extent_phrN_pdwN_map(struct skc_extent_phrN_pdwN * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - return skc_extent_phrN_pdwN_map_size(extent,extent->size,cq,event); -} - -void -skc_extent_phrN_pdwN_unmap(struct skc_extent_phrN_pdwN * const extent, - void * const hrN, - cl_command_queue const cq, - cl_event * const event) -{ - cl(EnqueueUnmapMemObject(cq,extent->dwN,hrN,0,NULL,event)); -} - -// -// DURABLE MAPPING -// -// ENTIRE EXTENT MAPPED TO W/O HOST MEMORY -// ENTIRE EXTENT UNMAPPED TO R/O DEVICE MEMORY -// -// Note: integrated vs. discrete GPUs will have different -// implementations because we don't want a GPU kernel repeatedly -// accessing pinned memory. 
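The map_size routines above request non-blocking maps (blocking_map = CL_FALSE) and hand back an event, so the mapped pointer must not be dereferenced until that event has completed. A small illustrative wrapper showing one safe pattern; this is not code from the patch:

#include <assert.h>
#include <CL/opencl.h>

/* Illustrative only: enqueue a non-blocking read map and wait on the
   returned event before letting the host touch the pointer. */
static void const *
map_for_read(cl_command_queue cq, cl_mem buf, size_t size)
{
  cl_int   err;
  cl_event ready;

  void * hr = clEnqueueMapBuffer(cq, buf,
                                 CL_FALSE,            /* non-blocking map */
                                 CL_MAP_READ, 0, size,
                                 0, NULL, &ready, &err);
  assert(err == CL_SUCCESS);

  clWaitForEvents(1, &ready);                          /* pointer is valid from here on */
  clReleaseEvent(ready);

  return hr;
}

Unmapping is symmetric: enqueue clEnqueueUnmapMemObject and let the device touch the extent only after that command has completed.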
-// - -void -skc_extent_phwN_pdrN_alloc(struct skc_runtime * const runtime, - struct skc_extent_phwN_pdrN * const extent, - size_t const size) -{ - cl_int cl_err; - - extent->size = size; - extent->drN = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - size,NULL,&cl_err); cl_ok(cl_err); -} - -void -skc_extent_phwN_pdrN_free(struct skc_runtime * const runtime, - struct skc_extent_phwN_pdrN * const extent) -{ - cl(ReleaseMemObject(extent->drN)); -} - -void * -skc_extent_phwN_pdrN_map_size(struct skc_extent_phwN_pdrN * const extent, - size_t const size, - cl_command_queue const cq, - cl_event * const event) -{ - cl_int cl_err; - - void * hwN = clEnqueueMapBuffer(cq,extent->drN, - CL_FALSE, - CL_MAP_WRITE,0,size, - 0,NULL,event,&cl_err); cl_ok(cl_err); - - return hwN; -} - -void * -skc_extent_phwN_pdrN_map(struct skc_extent_phwN_pdrN * const extent, - cl_command_queue const cq, - cl_event * const event) -{ - return skc_extent_phwN_pdrN_map_size(extent,extent->size,cq,event); -} - -void -skc_extent_phwN_pdrN_unmap(struct skc_extent_phwN_pdrN * const extent, - void * const hwN, - cl_command_queue const cq, - cl_event * const event) -{ - cl(EnqueueUnmapMemObject(cq,extent->drN,hwN,0,NULL,event)); -} - -// -// -// diff --git a/src/compute/skc/fills_expand.cl b/src/compute/skc/fills_expand.cl deleted file mode 100644 index b6f56794c5..0000000000 --- a/src/compute/skc/fills_expand.cl +++ /dev/null @@ -1,309 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "atomic_cl.h" -#include "block.h" -#include "path.h" -#include "common.h" - -// -// -// - -#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) - -#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) -#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) - -#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// -// - -#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// -// - -#if ( SKC_FILLS_EXPAND_X == 1 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_FILLS_EXPAND_X == 2 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_FILLS_EXPAND_X == 4 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_FILLS_EXPAND_X == 8 ) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_FILLS_EXPAND_X == 16) -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_FILLS_EXPAND_X" -#endif - -// -// Fill and rasterize cmds only differ in their first word semantics -// - -union skc_cmd_expand -{ - union skc_cmd_fill fill; - union skc_cmd_rasterize rasterize; -}; - -// -// -// - -union skc_path_elem -{ - skc_uint u32; - skc_float f32; -}; - -// -// COMPILE-TIME AND RUN-TIME MACROS -// - -#define SKC_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \ - 
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -#define SKC_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE) - -#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - SKC_BROADCAST_LAST_HELPER(E,I) - -// -// -// - -void -skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out, - skc_uint * const out_idx, - union skc_cmd_expand * const cmd, - union skc_path_elem const e, - skc_uint const e_idx) -{ - // - // FIXME -- we can append a large number of nodeword indices to a - // local SMEM queue and flush when full. It may or may not be a - // performance win on some architectures. - // - skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT; - skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0); - - cmd->rasterize.nodeword = e_idx; - - if (is_elem) { - cmds_out[*out_idx + offset] = cmd->rasterize; - } - - *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1); -} - -// -// -// - -__kernel -SKC_FILLS_EXPAND_KERNEL_ATTRIBS -void -skc_kernel_fills_expand(__global union skc_path_elem const * const blocks, - __global skc_uint volatile * const atomics, - __global skc_block_id_t const * const map, - __global union skc_cmd_fill const * const cmds_in, - __global union skc_cmd_rasterize * const cmds_out) -{ - // - // Need to harmonize the way we determine a subgroup's id. In this - // kernel it's not as important because no local memory is being - // used. Although the device/mask calc to determine subgroup and - // lanes is still proper, we might want to make it clearer that - // we're working with subgroups by using the subgroup API. 
- // - // every subgroup/simd that will work on the block loads the same command - // -#if (__OPENCL_VERSION__ < 200) - skc_uint const cmd_stride = get_num_sub_groups(); -#else - skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id(); - - // load fill command -- we reuse y component - union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] }; - - // get the path header block from the map - skc_block_id_t id = map[cmd.fill.path]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("expand[%u] = %u\n",cmd_idx,id); -#endif - - // - // blindly load all of the head elements into registers - // - skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - skc_uint count_nodes, count_prims; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ - count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \ - } \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \ - count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \ - } - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // debug of path head - // -#if 0 - skc_uint count_blocks; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ - count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ - } - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - if (get_sub_group_local_id() == 0) - printf("path header = { %5u, %5u, %5u }\n", - count_blocks,count_nodes,count_prims); -#endif - - // - // acquire slots in the expanded cmd extent - // - // decrement prim_idx by 1 so we can use inclusive warp scan later - // - skc_uint out_idx = 0; - - if (get_sub_group_local_id() == 0) { - out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP - (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1; - } - - out_idx = sub_group_broadcast(out_idx,0); - - // - // process ids trailing the path header - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ - if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ - if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \ - h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \ - } \ - } \ - skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \ - head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \ - } - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, process the nodes - // - - // - // get id of next node - // - id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); - - // - // the following blocks are nodes - // - while (true) - { - // get index of each element - skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); - - // - // blindly load all of the node elements into registers - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // - // append all valid ids - // -#undef 
SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \ - node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); - - SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); - - // any more nodes? - if (--count_nodes == 0) - return; - - // - // get id of next node - // - id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); - } -} - -// -// -// diff --git a/src/compute/skc/handle_pool_cl_12.c b/src/compute/skc/handle_pool_cl_12.c deleted file mode 100644 index 65288c3656..0000000000 --- a/src/compute/skc/handle_pool_cl_12.c +++ /dev/null @@ -1,752 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include -#include - -// -// -// - -#include "common/cl/assert_cl.h" - -#include "block.h" -#include "grid.h" -#include "config_cl.h" -#include "runtime_cl_12.h" - -// -// FIXME -- these comments are now quite stale -// -// -// HANDLE/ACQUIRE RELEASE -// -// The runtime vends handles just in case we decide to exploit shared -// virtual memory. But for most platforms and devices we will have a -// pool of host-managed handles and on the device there will be a -// table that maps the host handle to a device-managed memory block. -// -// HANDLE READINESS -// -// A host handle may reference a path or a raster which is not ready -// for use further down the pipeline because it hasn't yet been -// processed by the device. -// -// The simplest scheme for providing every handle a readiness state is -// to build a map that that marks a new handle as being not-ready -// while being processed by a particular grid id. When the final -// sub-pipeline grid responsible for the path or raster is complete, -// then mark the handle as being ready and eventually return the grid -// id back to the pool. This can be performed on a separate thread. -// -// The side-benefit of this approach is that a handle's reference -// count integral type can spare some bits for its associated grid id. -// -// A more memory-intensive approach uses a 64-bit epoch+grid key and -// relies on the ~56 bits of epoch space to avoid any post -// sub-pipeline status update by assuming that a handle and grid will -// match or mismatch when queried. 
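The epoch+grid alternative is only described in prose above; the following is a purely illustrative sketch of such a 64-bit readiness key, not the representation the handle pool below actually uses:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative 64-bit readiness key: ~56 bits of epoch plus an 8-bit
   grid id.  A handle is "not ready" while its stored key equals the key
   of a still-running grid; once the epoch advances the comparison can
   only mismatch, so no post-completion update of the handle is needed. */
typedef uint64_t readiness_key_t;

#define GRID_BITS 8u
#define GRID_MASK ((UINT64_C(1) << GRID_BITS) - 1)

static readiness_key_t
readiness_key_make(uint64_t epoch, uint32_t grid)
{
  return (epoch << GRID_BITS) | ((uint64_t)grid & GRID_MASK);
}

static bool
readiness_key_busy(readiness_key_t handle_key, readiness_key_t grid_key)
{
  return handle_key == grid_key;   /* equal keys: the grid still owns the handle */
}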
-// - -#define SKC_HANDLE_REFCNT_HOST_BITS (SKC_MEMBER_SIZE(union skc_handle_refcnt,h) * 8) -#define SKC_HANDLE_REFCNT_DEVICE_BITS (SKC_MEMBER_SIZE(union skc_handle_refcnt,d) * 8) - -#define SKC_HANDLE_REFCNT_HOST_MAX SKC_BITS_TO_MASK(SKC_HANDLE_REFCNT_HOST_BITS) -#define SKC_HANDLE_REFCNT_DEVICE_MAX SKC_BITS_TO_MASK(SKC_HANDLE_REFCNT_DEVICE_BITS) - -// -// -// - -static -void -skc_handle_reclaim_create(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - skc_handle_reclaim_type_e const reclaim_type, - skc_device_kernel_id const kernel_id) -{ - struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; - - // init counters - reclaim->bih.rem = 0; - - // acquire kernel - reclaim->kernel = skc_device_acquire_kernel(runtime->device,kernel_id); - reclaim->kernel_id = kernel_id; - - // set default args - cl(SetKernelArg(reclaim->kernel,0,SKC_CL_ARG(runtime->block_pool.ids.drw))); - cl(SetKernelArg(reclaim->kernel,1,SKC_CL_ARG(runtime->block_pool.blocks.drw))); - cl(SetKernelArg(reclaim->kernel,2,SKC_CL_ARG(runtime->block_pool.atomics.drw))); - cl(SetKernelArg(reclaim->kernel,3,SKC_CL_ARG(runtime->config->block_pool.ring_mask))); - cl(SetKernelArg(reclaim->kernel,4,SKC_CL_ARG(runtime->handle_pool.map.drw))); -} - -static -void -skc_handle_reclaim_dispose(struct skc_runtime * const runtime, - skc_handle_reclaim_type_e const reclaim_type) -{ - struct skc_handle_reclaim * const reclaim = runtime->handle_pool.reclaim + reclaim_type; - - cl(ReleaseKernel(reclaim->kernel)); -} - -// -// -// - -#define SKC_HANDLE_POOL_BLOCKS_PAD 8 - -void -skc_handle_pool_create(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - skc_uint const size, - skc_uint const width, - skc_uint const recs) -{ - skc_uint const blocks = (size + width - 1) / width; - skc_uint const blocks_padded = blocks + SKC_HANDLE_POOL_BLOCKS_PAD; - skc_uint const handles = blocks * width; - skc_uint const handles_padded = blocks_padded * width; - skc_uint const recs_padded = recs + 2; // one for pointer and one for head node - - skc_extent_pdrw_alloc(runtime,&handle_pool->map,handles * sizeof(skc_block_id_t)); - - handle_pool->handle.indices = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,handles_padded * sizeof(*handle_pool->handle.indices)); - handle_pool->handle.refcnts = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,handles * sizeof(*handle_pool->handle.refcnts)); - handle_pool->block.indices = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,blocks_padded * sizeof(*handle_pool->block.indices)); - handle_pool->recs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,recs_padded * sizeof(*handle_pool->recs)); - - // initialize handles and refcnts - for (skc_uint ii=0; iihandle.indices[ii] = ii; - - for (skc_uint ii=0; iihandle.refcnts[ii].hd = 0; - - handle_pool->handle.count = handles; - - // initialize block accounting - for (skc_uint ii=0; iiblock.indices[ii] = ii; - - handle_pool->block.count = blocks_padded; - handle_pool->block.width = width; - - handle_pool->block.tos = blocks; // pop = pre-decrement / push = post-increment - handle_pool->block.bos = blocks; // pop = post-increment / push = pre-decrement - - // initialize recs -- first two elements are interpreted differently - handle_pool->recs[0].runtime = runtime; - handle_pool->recs[1] = (union skc_handle_reclaim_rec){ .rem = recs, .head = 2 }; - - for (skc_uint ii=2; iirecs[ii] = (union skc_handle_reclaim_rec){ .index = ii, .next = ii+1 }; - - 
handle_pool->recs[recs_padded-1].next = SKC_UINT_MAX; - - // initialize acquire - handle_pool->acquire.rem = 0; - - // create reclaimers - skc_handle_reclaim_create(runtime, - handle_pool, - SKC_HANDLE_RECLAIM_TYPE_PATH, - SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM); - - skc_handle_reclaim_create(runtime, - handle_pool, - SKC_HANDLE_RECLAIM_TYPE_RASTER, - SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM); -} - -// -// -// - -void -skc_handle_pool_dispose(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool) -{ - skc_handle_reclaim_dispose(runtime,SKC_HANDLE_RECLAIM_TYPE_RASTER); - skc_handle_reclaim_dispose(runtime,SKC_HANDLE_RECLAIM_TYPE_PATH); - - skc_runtime_host_perm_free(runtime,handle_pool->recs); - skc_runtime_host_perm_free(runtime,handle_pool->block.indices); - skc_runtime_host_perm_free(runtime,handle_pool->handle.refcnts); - skc_runtime_host_perm_free(runtime,handle_pool->handle.indices); - - skc_extent_pdrw_free(runtime,&handle_pool->map); -} - -// -// -// - -static -skc_uint -skc_handle_pool_block_readable_pop(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool) -{ - SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,handle_pool->block.tos == 0); - - skc_uint const index = handle_pool->block.indices[--handle_pool->block.tos]; - -#if 0 - skc_handle_t * handles = handle_pool->handle.indices + (index + 1) * handle_pool->block.width; - for (skc_uint ii=0; iiblock.width; ii++) - printf("R-: %u\n",*--handles); -#endif - - return index; -} - -static -void -skc_handle_pool_block_readable_push(struct skc_handle_pool * const handle_pool, - skc_uint const index) -{ - handle_pool->block.indices[handle_pool->block.tos++] = index; - -#if 0 - skc_handle_t * handles = handle_pool->handle.indices + (index + 1) * handle_pool->block.width; - for (skc_uint ii=0; iiblock.width; ii++) - printf("R+: %u\n",*--handles); -#endif -} - - -static -skc_uint -skc_handle_pool_block_writable_pop(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool) -{ - SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,handle_pool->block.bos == handle_pool->block.count); - - return handle_pool->block.indices[handle_pool->block.bos++]; -} - -static -void -skc_handle_pool_block_writable_push(struct skc_handle_pool * const handle_pool, - skc_uint const block_idx) -{ - handle_pool->block.indices[--handle_pool->block.bos] = block_idx; -} - -// -// May need to acquire the path or raster handle *early* just to be -// sure one exists -// - -skc_handle_t -skc_runtime_handle_device_acquire(struct skc_runtime * const runtime) -{ - struct skc_handle_pool * const handle_pool = &runtime->handle_pool; - - // acquire a block of handles at a time - if (handle_pool->acquire.rem == 0) - { - skc_uint const block_idx = skc_handle_pool_block_readable_pop(runtime,handle_pool); - - handle_pool->acquire.block = block_idx; - handle_pool->acquire.rem = handle_pool->block.width; - handle_pool->acquire.handles = handle_pool->handle.indices + (block_idx + 1) * handle_pool->block.width; - } - - // load handle from next block slot - skc_uint const rem = --handle_pool->acquire.rem; - skc_handle_t const handle = *--handle_pool->acquire.handles; - - // initialize refcnt for handle - handle_pool->handle.refcnts[handle] = (union skc_handle_refcnt){ .h = 1, .d = 1 }; - - // if this was the last handle in the block then move the block id - // to the reclamation stack to be used as a scratchpad - if (rem == 0) { - skc_handle_pool_block_writable_push(handle_pool,handle_pool->acquire.block); - } - - return handle; 
-} - -// -// -// - -static -void -skc_handle_reclaim_completion(union skc_handle_reclaim_rec * const recN) -{ - // get root rec which contains pointer to runtime - union skc_handle_reclaim_rec * const rec0 = recN - recN->index; - union skc_handle_reclaim_rec * const rec1 = rec0 + 1; - - // return block for reading - skc_handle_pool_block_readable_push(&rec0->runtime->handle_pool,recN->block); - - // recN is new head of list - recN->next = rec1->head; - rec1->head = recN->index; - rec1->rem += 1; -} - -static -void -skc_handle_reclaim_cb(cl_event event, cl_int status, union skc_handle_reclaim_rec * const recN) -{ - SKC_CL_CB(status); - - union skc_handle_reclaim_rec * const rec0 = recN - recN->index; - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(rec0->runtime->scheduler,skc_handle_reclaim_completion,recN); -} - -// -// FIXME -- is there an issue launching on the host thread? -// - -static -void -skc_handle_reclaim_launch(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - struct skc_handle_reclaim * const reclaim, - union skc_handle_reclaim_rec * const recN) -{ - cl(SetKernelArg(reclaim->kernel, - 5, - handle_pool->block.width * sizeof(skc_handle_t), - reclaim->bih.handles)); - - // acquire a cq - cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); - - cl_event complete; - - // the kernel grid is shaped by the target device - skc_device_enqueue_kernel(runtime->device, - reclaim->kernel_id, - cq, - reclaim->kernel, - handle_pool->block.width, - 0,NULL,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_handle_reclaim_cb,recN)); - cl(ReleaseEvent(complete)); - - // kickstart kernel execution - cl(Flush(cq)); - - // release the cq - skc_runtime_release_cq_in_order(runtime,cq); -} - -// -// reclaim a handle -// - -static -union skc_handle_reclaim_rec * -skc_handle_acquire_reclaim_rec(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool) -{ - union skc_handle_reclaim_rec * const rec1 = handle_pool->recs + 1; - - SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,rec1->rem == 0); - - union skc_handle_reclaim_rec * const recN = handle_pool->recs + rec1->head; - - rec1->head = recN->next; - rec1->rem -= 1; - - // fprintf(stderr,"rec1->rem = %u\n",rec1->rem); - - return recN; -} - -static -void -skc_runtime_device_reclaim(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - struct skc_handle_reclaim * const reclaim, - skc_handle_t const handle) -{ - // grab a new block? - if (reclaim->bih.rem == 0) - { - skc_uint const block_idx = skc_handle_pool_block_writable_pop(runtime,handle_pool); - - reclaim->bih.block = block_idx; - reclaim->bih.rem = handle_pool->block.width; - reclaim->bih.handles = handle_pool->handle.indices + (block_idx + 1) * handle_pool->block.width; - } - - // store handle -- handle's refcnt was already set to {0:0} - *--reclaim->bih.handles = handle; - - // if block is full then launch reclamation kernel - if (--reclaim->bih.rem == 0) - { - union skc_handle_reclaim_rec * recN = skc_handle_acquire_reclaim_rec(runtime,handle_pool); - - recN->block = reclaim->bih.block; - - skc_handle_reclaim_launch(runtime,handle_pool,reclaim,recN); - } -} - -// -// Validate host-provided handles before retaining. 
-// -// Retain validation consists of: -// -// - correct handle type -// - handle is in range of pool -// - host refcnt is not zero -// - host refcnt is not at the maximum value -// -// After validation, retain the handles for the host -// - -static -skc_err -skc_runtime_handle_host_validated_retain(struct skc_runtime * const runtime, - skc_typed_handle_type_e const handle_type, - skc_typed_handle_t const * const typed_handles, - uint32_t const count) -{ - // - // FIXME -- test to make sure handles aren't completely out of range integers - // - - union skc_handle_refcnt * const refcnts = runtime->handle_pool.handle.refcnts; - - for (skc_uint ii=0; ii= runtime->handle_pool.handle.count) - { - return SKC_ERR_HANDLE_INVALID; - } - else - { - union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; - skc_uint const host = refcnt_ptr->h; - - if (host == 0) - { - return SKC_ERR_HANDLE_INVALID; - } - else if (host == SKC_HANDLE_REFCNT_HOST_MAX) - { - return SKC_ERR_HANDLE_OVERFLOW; - } - } - } - } - - // - // all the handles validated, so retain them all.. - // - for (skc_uint ii=0; iideps,rasters,count); - - return SKC_ERR_SUCCESS; -} - -skc_err -skc_runtime_path_host_flush(struct skc_runtime * const runtime, - skc_path_t const * paths, - uint32_t count) -{ - skc_grid_deps_force(runtime->deps,paths,count); - - return SKC_ERR_SUCCESS; -} - -// -// Validate host-provided handles before releasing. -// -// Release validation consists of: -// -// - correct handle type -// - handle is in range of pool -// - host refcnt is not zero -// -// After validation, release the handles for the host -// - -static -skc_err -skc_runtime_host_validated_release(struct skc_runtime * const runtime, - skc_typed_handle_type_e const type, - skc_handle_reclaim_type_e const reclaim_type, - skc_typed_handle_t const * const handles, - uint32_t const count) -{ - struct skc_handle_pool * const handle_pool = &runtime->handle_pool; - union skc_handle_refcnt * const refcnts = handle_pool->handle.refcnts; - - for (skc_uint ii=0; ii= handle_pool->handle.count) - { - return SKC_ERR_HANDLE_INVALID; - } - else - { - union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; - skc_uint const host = refcnt_ptr->h; - - if (host == 0) - { - return SKC_ERR_HANDLE_INVALID; - } - } - } - } - - // - // all the handles validated, so release them all.. 
- // - struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; - - for (skc_uint ii=0; iihandle_pool.handle.refcnts; - - while (count-- > 0) - { - skc_typed_handle_t const typed_handle = *handles++; - - if (!SKC_TYPED_HANDLE_IS_TYPE(typed_handle,type)) - { - return SKC_ERR_HANDLE_INVALID; - } - else - { - skc_handle_t const handle = SKC_TYPED_HANDLE_TO_HANDLE(typed_handle); - - if (handle >= runtime->handle_pool.handle.count) - { - return SKC_ERR_HANDLE_INVALID; - } - else - { - union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; - union skc_handle_refcnt refcnt = *refcnt_ptr; - - if (refcnt.h == 0) - { - return SKC_ERR_HANDLE_INVALID; - } - else if (refcnt.d == SKC_HANDLE_REFCNT_DEVICE_MAX) - { - return SKC_ERR_HANDLE_OVERFLOW; - } - } - } - } - - return SKC_ERR_SUCCESS; -} - -// -// After validation, retain the handles for the device -// - -void -skc_runtime_handle_device_retain(struct skc_runtime * const runtime, - skc_handle_t const * handles, - uint32_t count) -{ - union skc_handle_refcnt * const refcnts = runtime->handle_pool.handle.refcnts; - - while (count-- > 0) - refcnts[SKC_TYPED_HANDLE_TO_HANDLE(*handles++)].d++; -} - -// -// Release the device-held handles -- no validation required! -// - -static -void -skc_runtime_handle_device_release(struct skc_runtime * const runtime, - skc_handle_reclaim_type_e const reclaim_type, - skc_handle_t const * handles, - skc_uint count) -{ - struct skc_handle_pool * const handle_pool = &runtime->handle_pool; - union skc_handle_refcnt * const refcnts = handle_pool->handle.refcnts; - struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; - - while (count-- > 0) { - skc_handle_t const handle = *handles++; - union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; - union skc_handle_refcnt refcnt = *refcnt_ptr; - - refcnt.d -= 1; - *refcnt_ptr = refcnt; - -#if 0 - printf("%8u = { %u, %u }\n",handle,refcnt.h,refcnt.d); -#endif - - if (refcnt.hd == 0) { - skc_runtime_device_reclaim(runtime,handle_pool,reclaim,handle); - } - } -} - -// -// -// - -void -skc_runtime_path_device_release(struct skc_runtime * const runtime, - skc_handle_t const * handles, - skc_uint count) -{ - skc_runtime_handle_device_release(runtime,SKC_HANDLE_RECLAIM_TYPE_PATH,handles,count); -} - -void -skc_runtime_raster_device_release(struct skc_runtime * const runtime, - skc_handle_t const * handles, - skc_uint count) -{ - skc_runtime_handle_device_release(runtime,SKC_HANDLE_RECLAIM_TYPE_RASTER,handles,count); -} - -// -// -// diff --git a/src/compute/skc/handle_pool_cl_12.h b/src/compute/skc/handle_pool_cl_12.h deleted file mode 100644 index 4fefae3552..0000000000 --- a/src/compute/skc/handle_pool_cl_12.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include "macros.h" -#include "handle.h" -#include "extent_cl_12.h" -#include "device_cl_12.h" - -// -// FIXME -- THIS DOCUMENTATION IS STALE NOW THAT A REFERENCE COUNT REP -// IS A {HOST:DEVICE} PAIR. -// -// Host-side handle pool -// -// The bulk size of the three extents is currently 6 bytes of overhead -// per number of host handles. The number of host handles is usually -// less than the number of blocks in the pool. Note that the maximum -// number of blocks is 2^27. -// -// A practical instantiation might provide a combined 2^20 path and -// raster host handles. 
This would occupy 6 MB of host RAM for the -// 32-bit handle, 8-bit reference count and 8-bit handle-to-grid map. -// -// Also note that we could use isolated/separate path and raster block -// pools. Worst case, this would double the memory footprint of SKC. -// -// Host-side handle reference count -// -// [0 ] : release -// [1..UMAX] : retain -// -// In a garbage-collected environment we might want to rely on an -// existing mechanism for determing whether a handle is live. -// -// Otherwise, we probably want to have a 16 or 32-bit ref count. -// -// The handle reference count is defensive and will not allow the host -// to underflow a handle that's still retained by the pipeline. -// -// The single reference counter is split into host and device counts. -// - -union skc_handle_refcnt -{ - skc_ushort hd; // host and device - - struct { - skc_uchar h; // host - skc_uchar d; // device - }; -}; - -SKC_STATIC_ASSERT(SKC_MEMBER_SIZE(union skc_handle_refcnt,hd) == - SKC_MEMBER_SIZE(union skc_handle_refcnt,h) + - SKC_MEMBER_SIZE(union skc_handle_refcnt,d)); - -// -// -// - -struct skc_handle_bih -{ - skc_uint block; - skc_uint rem; - skc_handle_t * handles; -}; - -struct skc_handle_reclaim -{ - struct skc_handle_bih bih; - - cl_kernel kernel; - skc_device_kernel_id kernel_id; -}; - -union skc_handle_reclaim_rec -{ - // ELEMENT 0 - struct skc_runtime * runtime; - - // ELEMENT 1 - struct { - skc_uint rem; // # of available records - skc_uint head; // index of first record - }; - - // ELEMENTS 2+ - struct { - skc_uint index; // index of this record -- never modified - union { - skc_uint next; // index of next record - skc_uint block; // block index of reclaimed handles - }; - }; -}; - -SKC_STATIC_ASSERT(sizeof(union skc_handle_reclaim_rec) == sizeof(skc_uint2)); - -// -// -// - -typedef enum skc_handle_reclaim_type_e { - - SKC_HANDLE_RECLAIM_TYPE_PATH, - SKC_HANDLE_RECLAIM_TYPE_RASTER, - - SKC_HANDLE_RECLAIM_TYPE_COUNT - -} skc_handle_reclaim_type_e; - -struct skc_handle_pool -{ - // - // FIXME -- should we be pedantic and make these always-host-side - // allocations "extents" as well? I think it's OK not being an - // extent structure for now and is mostly consistent with the rest - // of the code. - // - // FIXME -- the cbs[] array is a little idiosyncratic but the intent - // is to avoid storing the 64-bit backpointer inside of every single - // record. This can be harmonized later. Note that only a few - // hundred outstanding callbacks would represent many many subgroups - // of work and would fully occupy the GPU (if we allow it). 
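As a sanity check on the 6 MB figure quoted in the (admittedly stale) comment above, a throwaway sketch of the per-handle host overhead; the names are hypothetical:

#include <stdint.h>

/* Illustrative arithmetic only: per host handle the pool keeps a 32-bit
   handle index, an 8-bit host refcount and an 8-bit handle-to-grid map
   entry, i.e. about 6 bytes of host overhead per handle. */
enum {
  BYTES_PER_HANDLE = sizeof(uint32_t)   /* handle index       */
                   + sizeof(uint8_t)    /* host refcount      */
                   + sizeof(uint8_t),   /* handle-to-grid map */

  HANDLE_COUNT     = 1u << 20           /* combined path+raster handles */
};

/* 6 bytes * 2^20 handles = 6 MiB of host RAM, i.e. the 6 MB quoted above. */
typedef char footprint_check[(BYTES_PER_HANDLE * HANDLE_COUNT == 6u << 20) ? 1 : -1];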
- // - // - struct skc_extent_pdrw map; // device-managed extent mapping a host handle to device block id - - struct { - skc_handle_t * indices; // array of individual host handles -- fragmented into blocks - union skc_handle_refcnt * refcnts; // array of reference counts indexed by an individual handle - skc_uint count; - } handle; - - struct { - skc_uint * indices; // stack of indices to fixed-size blocks of host handles - skc_uint count; // number of handles -- valid from [0,size) - skc_uint width; // width of a fixed-size block of handles - skc_uint tos; // grows upward / push++ / --pop / # fixed-size blocks for reading - skc_uint bos; // grows downward / --push / pop++ / # fixed-size blocks for writing - } block; - - union skc_handle_reclaim_rec * recs; // array of reclaim records - - struct skc_handle_bih acquire; - struct skc_handle_reclaim reclaim[SKC_HANDLE_RECLAIM_TYPE_COUNT]; -}; - -// -// -// - -void -skc_handle_pool_create(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool, - skc_uint const size, - skc_uint const width, - skc_uint const recs); - -void -skc_handle_pool_dispose(struct skc_runtime * const runtime, - struct skc_handle_pool * const handle_pool); - -// -// -// diff --git a/src/compute/skc/interop.c b/src/compute/skc/interop.c deleted file mode 100644 index 6697bb7e83..0000000000 --- a/src/compute/skc/interop.c +++ /dev/null @@ -1,629 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include -#include - -// -// -// - -#include -#include -#include -#include - -// -// -// - -#include "common/cl/assert_cl.h" -#include "types.h" - -// -// -// - -#include "interop.h" -#include "context.h" -#include "runtime_cl_12.h" - -// -// -// - -#include "svg2skc/transform_stack.h" - -// -// -// - -#if 1 -#define SKC_IMAGE_FORMAT GL_RGBA8 -#else -#define SKC_IMAGE_FORMAT GL_RGBA16F -#endif - -// -// -// - -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif - -// -// -// - -struct skc_interop_fb -{ - cl_context context; - - GLuint fbo; - GLuint rbo; - - cl_mem mem; - - int width; - int height; - - bool is_srgb; - bool is_vsync_on; - bool is_fullscreen; - bool is_iconified; - bool is_resized; - bool is_spinning; - bool is_info; - - skc_float scale; - skc_float2 translate; - float rotate_theta; -}; - -static struct skc_interop_fb fb = - { - .mem = NULL, - - .is_srgb = true, - .is_vsync_on = false, - .is_fullscreen = false, - .is_iconified = false, - .is_resized = true, - .is_spinning = false, - .is_info = false, - - .scale = 1.0f, - .translate = { 0.0f, 0.0f }, - .rotate_theta = 0.0f - }; - -// -// FPS COUNTER FROM HERE: -// -// http://antongerdelan.net/opengl/glcontext2.html -// - -static -void -skc_interop_fps(GLFWwindow * window) -{ - if (fb.is_fullscreen) - return; - - // static fps counters - static double stamp_prev = 0.0; - static int frame_count = 0; - - // locals - double const stamp_curr = glfwGetTime(); - double const elapsed = stamp_curr - stamp_prev; - - if (elapsed >= 0.5) - { - stamp_prev = stamp_curr; - - double const fps = (double)frame_count / elapsed; - - char tmp[64]; - - sprintf_s(tmp,64,"(%d x %d) - VSync %s - sRGB %s - FPS: %.2f", - fb.width,fb.height, - fb.is_vsync_on ? "ON" : "OFF", - fb.is_srgb ? 
"ENABLED" : "DISABLED", - fps); - - glfwSetWindowTitle(window,tmp); - - frame_count = 0; - } - - frame_count++; -} - -// -// INITIALIZE GLFW/GLAD -// - -static -void -skc_interop_error_callback(int error, char const * description) -{ - fputs(description,stderr); -} - -// -// -// - -static -void -skc_interop_iconify_callback(GLFWwindow * window, int iconified) -{ - fb.is_iconified = iconified; -} - -// -// -// - -static -void -skc_interop_key_callback(GLFWwindow * window, int key, int scancode, int action, int mods) -{ - if (action == GLFW_RELEASE) - return; - - switch (key) - { - case GLFW_KEY_EQUAL: - fb.rotate_theta = 0.0f; - break; - - case GLFW_KEY_I: - fb.is_info = true; - break; - - case GLFW_KEY_R: - fb.is_spinning ^= true; - break; - - case GLFW_KEY_S: - fb.is_srgb ^= true; - if (fb.is_srgb) - glEnable(GL_FRAMEBUFFER_SRGB); - else - glDisable(GL_FRAMEBUFFER_SRGB); - break; - - case GLFW_KEY_V: - fb.is_vsync_on ^= true; - glfwSwapInterval(fb.is_vsync_on ? 1 : 0); - break; - - case GLFW_KEY_W: - glfwSetWindowSize(window,1024,1024); - break; - - case GLFW_KEY_ESCAPE: - glfwSetWindowShouldClose(window,GL_TRUE); - break; - } -} - -static -void -skc_interop_window_size_callback(GLFWwindow * window, int width, int height) -{ - fb.width = width; - fb.height = height; - fb.is_resized = true; - -#if 0 - skc_render_kernel_set_clip(0,0,width,height); -#endif -} - -static -void -skc_interop_scale(double const scale_offset) -{ -#define SKC_SCALE_FACTOR 1.05 - - static double scale_exp = 0.0; - - scale_exp += scale_offset; - fb.scale = (float)pow(SKC_SCALE_FACTOR,scale_exp); -} - -static -void -skc_interop_scroll_callback(GLFWwindow * window, double xoffset, double yoffset) -{ - bool const ctrl = - (glfwGetKey(window,GLFW_KEY_LEFT_CONTROL) == GLFW_PRESS) || - (glfwGetKey(window,GLFW_KEY_RIGHT_CONTROL) == GLFW_PRESS); - - if (!ctrl) - return; - - skc_interop_scale(yoffset); -} - -static -void -skc_interop_translate(float const dx, float const dy) -{ - float const dx_scaled = dx / fb.scale; - float const dy_scaled = dy / fb.scale; - - float const cos_theta = cosf(fb.rotate_theta); // replace with cospi if available - float const sin_theta = sinf(fb.rotate_theta); // replace with sinpi if available - - fb.translate.x += dx_scaled*cos_theta + dy_scaled*sin_theta; - fb.translate.y += dy_scaled*cos_theta - dx_scaled*sin_theta; -} - -static -void -skc_interop_cursor_position_callback(GLFWwindow * window, double x, double y) -{ - int const state = glfwGetMouseButton(window,GLFW_MOUSE_BUTTON_LEFT); - - static bool is_mouse_dragging = false; - static float x_prev=0.0, y_prev=0.0; - - float const mx = (float)x; - float const my = (float)y; - - if (state == GLFW_PRESS) - { - if (is_mouse_dragging) - { - const bool ctrl = - (glfwGetKey(window,GLFW_KEY_LEFT_CONTROL) == GLFW_PRESS) || - (glfwGetKey(window,GLFW_KEY_RIGHT_CONTROL) == GLFW_PRESS); - - if (ctrl) - { - float const cx = 0.5f * fb.width; - float const cy = 0.5f * fb.height; - - // find angle between mouse and center - float const vx = x_prev - cx; - float const vy = y_prev - cy; - - float const wx = mx - cx; - float const wy = my - cy; - - float const len = sqrtf((vx*vx + vy*vy) * (wx*wx + wy*wy)); - - if (len > 0.0f) - { - float const dot = vx*wx + vy*wy; - float const da = acosf(dot / len); - - if (vx*wy - vy*wx >= 0.0f) - fb.rotate_theta += da; - else - fb.rotate_theta -= da; - - fb.rotate_theta = fmodf(fb.rotate_theta,(float)(M_PI*2.0)); - } - } - else - { - skc_interop_translate(mx - x_prev, - my - y_prev); - } - } - else - { - is_mouse_dragging 
= true; - } - - x_prev = mx; - y_prev = my; - } - else - { - is_mouse_dragging = false; - } -} - -// -// -// - -static -void -skc_interop_resize() -{ - fb.is_resized = false; - - // release the image2d - if (fb.mem != NULL) - cl(ReleaseMemObject(fb.mem)); - - // resize rbo - glNamedRenderbufferStorage(fb.rbo, - SKC_IMAGE_FORMAT, - fb.width, - fb.height); - - // attach rbo to fbo - glNamedFramebufferRenderbuffer(fb.fbo, - GL_COLOR_ATTACHMENT0, - GL_RENDERBUFFER, - fb.rbo); - // - // - // - cl_int cl_err; - - fb.mem = clCreateFromGLRenderbuffer(fb.context, - CL_MEM_WRITE_ONLY, - fb.rbo, - &cl_err); cl_ok(cl_err); - // - // for debugging porpoises! - // - cl_image_format format; - - cl(GetImageInfo(fb.mem, - CL_IMAGE_FORMAT, - sizeof(format), - &format, - NULL)); -} - -// -// -// - -static -void -skc_interop_acquire() -{ - // frame buffer object - glCreateFramebuffers(1,&fb.fbo); - - // render buffer object w/a color buffer - glCreateRenderbuffers(1,&fb.rbo); - - // size rbo - glNamedRenderbufferStorage(fb.rbo, - SKC_IMAGE_FORMAT, - fb.width, - fb.height); - - // attach rbo to fbo - glNamedFramebufferRenderbuffer(fb.fbo, - GL_COLOR_ATTACHMENT0, - GL_RENDERBUFFER, - fb.rbo); -} - -void -skc_interop_register(skc_context_t context) -{ - fb.context = context->runtime->cl.context; -} - -// -// -// - -void -skc_interop_init(GLFWwindow * * window) -{ - // - // INITIALIZE GLFW/GLAD - // - glfwSetErrorCallback(skc_interop_error_callback); - - if (!glfwInit()) - exit(EXIT_FAILURE); - - GLFWmonitor * const primary = glfwGetPrimaryMonitor(); - GLFWvidmode const * const mode = glfwGetVideoMode(primary); - - if (fb.is_fullscreen) - { - fb.width = mode->width; - fb.height = mode->height; - } - else - { - fb.width = 1600; - fb.height = 1024; - } - - glfwWindowHint(GLFW_ALPHA_BITS, 0); - glfwWindowHint(GLFW_DEPTH_BITS, 0); - glfwWindowHint(GLFW_STENCIL_BITS, 0); - - glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE); - - glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); - glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5); - - glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); - - *window = glfwCreateWindow(fb.width,fb.height, - "Skia Compute", - fb.is_fullscreen ? primary : NULL, - NULL); - - if (*window == NULL) - { - glfwTerminate(); - exit(EXIT_FAILURE); - } - - glfwMakeContextCurrent(*window); - - // set up GLAD - gladLoadGLLoader((GLADloadproc)glfwGetProcAddress); - - // ignore vsync for now - glfwSwapInterval(fb.is_vsync_on ? 1 : 0); - - // only copy r/g/b - glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); - - // enable SRGB, disable scissor - glEnable(GL_FRAMEBUFFER_SRGB); - glDisable(GL_SCISSOR_TEST); - - // - // SET USER POINTER AND CALLBACKS - // - glfwSetKeyCallback (*window,skc_interop_key_callback); - glfwSetFramebufferSizeCallback(*window,skc_interop_window_size_callback); - glfwSetScrollCallback (*window,skc_interop_scroll_callback); - glfwSetCursorPosCallback (*window,skc_interop_cursor_position_callback); - glfwSetWindowIconifyCallback (*window,skc_interop_iconify_callback); - - // - // - // - fprintf(stderr, - "GL_VENDOR : %s\n" - "GL_RENDERER : %s\n", - glGetString(GL_VENDOR), - glGetString(GL_RENDERER)); - - // - // acquire an FBO/RBO - // - skc_interop_acquire(); -} - -// -// -// - -#define SKC_ROTATE_STEP ((float)(M_PI / 180.0)) - -static -void -skc_interop_transform(struct skc_transform_stack * ts) -{ - // OpenGL'ism - skc_transform_stack_push_affine(ts, - 1.0f, 0.0f,0.0f, - 0.0f,-1.0f,(float)fb.height); - // multiply - skc_transform_stack_concat(ts); - - // spinner... 
- if (fb.is_spinning) - fb.rotate_theta = fmodf(fb.rotate_theta + SKC_ROTATE_STEP,(float)(M_PI*2.0)); - - // always rotate and scale around surface center point - skc_transform_stack_push_rotate_scale_xy(ts, - fb.rotate_theta, - fb.scale,fb.scale, - 0.5f*fb.width,0.5f*fb.height); - skc_transform_stack_concat(ts); - - // where did the mouse take us? - skc_transform_stack_push_translate(ts, - fb.translate.x,fb.translate.y); - skc_transform_stack_concat(ts); -} - - -void -skc_interop_poll(GLFWwindow * window, - struct skc_transform_stack * ts) -{ - // wait until uniconified - while (fb.is_iconified) - { - glfwWaitEvents(); - continue; - } - - // what's happended? - glfwPollEvents(); - - // resize? - if (fb.is_resized) - skc_interop_resize(); - - // monitor fps - skc_interop_fps(window); - - skc_interop_transform(ts); -} - -// -// -// - -void -skc_interop_blit(GLFWwindow * window) -{ - // blit skc rbo - glBlitNamedFramebuffer(fb.fbo,0, - 0,0,fb.width,fb.height, - 0,0,fb.width,fb.height, - GL_COLOR_BUFFER_BIT, - GL_NEAREST); - -#if 0 - // - // FIXME -- this clear does nothing! - // - // As a hack we're clearing the interop'd RBO with a - // clEnqueueFillImage(). - // - float const rgba[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; - // GLenum const attachments[] = { GL_COLOR_ATTACHMENT0 }; - // glInvalidateNamedFramebufferData(fb.fbo,1,attachments); - glClearNamedFramebufferfv(fb.fbo,GL_COLOR,0,rgba); -#endif - - // swap buffers - glfwSwapBuffers(window); -} - -// -// -// - -void * -skc_interop_get_fb(GLFWwindow * window) -{ - glFlush(); - - return fb.mem; -} - -// -// -// - -void -skc_interop_get_dim(uint32_t dim[2]) -{ - dim[0] = fb.width; - dim[1] = fb.height; -} - -// -// -// - - diff --git a/src/compute/skc/interop.h b/src/compute/skc/interop.h deleted file mode 100644 index 112d365764..0000000000 --- a/src/compute/skc/interop.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// -// - -#include "skc.h" - -// -// -// - -void -skc_interop_init(GLFWwindow * * window); - -void -skc_interop_register(skc_context_t context); - -void -skc_interop_poll(GLFWwindow * window, - struct skc_transform_stack * ts); - -void * -skc_interop_get_fb(GLFWwindow * window); - -void -skc_interop_get_dim(uint32_t dim[2]); - -void -skc_interop_blit(GLFWwindow * window); - -// -// -// diff --git a/src/compute/skc/main.c b/src/compute/skc/main.c index 8833b0bb1c..8261f4bdf8 100644 --- a/src/compute/skc/main.c +++ b/src/compute/skc/main.c @@ -30,7 +30,7 @@ // #include -#include "interop.h" +#include "platforms/cl_12/gl/interop.h" // // @@ -49,7 +49,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context); // // -static +static void is_render_complete(skc_surface_t surface, skc_styling_t styling, @@ -67,9 +67,9 @@ int main(int argc, char** argv) { // + // // - // - if (argc <= 1) + if (argc <= 1) { fprintf(stderr,"-- missing filename\n"); return EXIT_FAILURE; // no filename @@ -110,7 +110,7 @@ main(int argc, char** argv) CL_WGL_HDC_KHR, (cl_context_properties)hDC, 0 }; - + // // create context // @@ -136,14 +136,14 @@ main(int argc, char** argv) skc_raster_builder_t raster_builder; err = skc_raster_builder_create(context,&raster_builder); - + // // create a composition // skc_composition_t composition; err = skc_composition_create(context,&composition); - + // // create a styling instance // @@ -154,7 +154,7 @@ main(int argc, char** argv) svg_doc_layer_count(svg_doc), 1000, 2 * 1024 * 1024); - + // // create a surface // @@ -191,7 +191,7 @@ main(int argc, char** argv) skc_transform_stack_restore(ts,ts_save); // decode layers -- places rasters - svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/); + svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/); // seal the composition skc_composition_seal(composition); @@ -244,7 +244,7 @@ main(int argc, char** argv) // unseal the composition skc_composition_unseal(composition,true); } - + // // dispose of mundane resources // diff --git a/src/compute/skc/make_all.bat b/src/compute/skc/make_all.bat deleted file mode 100644 index 4772cc73b4..0000000000 --- a/src/compute/skc/make_all.bat +++ /dev/null @@ -1,15 +0,0 @@ -@ECHO OFF - -CMD /C make_inl_cl.bat block_pool_init.cl -CMD /C make_inl_cl.bat fills_expand.cl -CMD /C make_inl_cl.bat paths_copy.cl -CMD /C make_inl_cl.bat rasterize.cl -CMD /C make_inl_cl.bat segment_ttrk.cl -CMD /C make_inl_cl.bat rasters_alloc.cl -CMD /C make_inl_cl.bat prefix.cl -CMD /C make_inl_cl.bat place.cl -CMD /C make_inl_cl.bat segment_ttck.cl -CMD /C make_inl_cl.bat render.cl -CMD /C make_inl_cl.bat paths_reclaim.cl -CMD /C make_inl_cl.bat rasters_reclaim.cl - diff --git a/src/compute/skc/make_inl_cl.bat b/src/compute/skc/make_inl_cl.bat deleted file mode 100644 index 777a5f3bc2..0000000000 --- a/src/compute/skc/make_inl_cl.bat +++ /dev/null @@ -1,72 +0,0 @@ -@ECHO OFF - -SET OPENCL_STD=-cl-std=CL1.2 -SET OPENCL_PRE=__OPENCL_C_VERSION__=120 - -:: OPENCL_STD=-cl-std=CL2.0 -:: OPENCL_PRE=__OPENCL_C_VERSION__=200 - -:: -:: -:: - -SET IOC=ioc64 - -:: -:: -:: - -SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info - -SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g - -SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% - -:: -:: -:: - -SET PRE_DIR=%~p1 - -CD %PRE_DIR% - -SET PRE_CL=%~n1 -SET PRE_CL=%PRE_CL%.pre.cl - -SET PRE_SRC_INL=%~n1 -SET 
PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl - -SET PRE_BIN_IR=%~n1 -SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir - -SET PRE_BIN_INL=%~n1 -SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl - -:: -:: *.pre.cl -:: *.pre.src.inl -:: - -CMD /C cl -I . -I .. -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" -CMD /C clang-format -style=Mozilla -i %PRE_CL% -CMD /C dos2unix -q %PRE_CL% -CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% - -echo %PRE_CL% -echo %PRE_SRC_INL% - -:: -:: *.pre.cl -:: *.pre.src.inl -:: - -CMD /C touch %PRE_BIN_IR% -ECHO ON -@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% -@ECHO OFF -CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% - -echo %PRE_BIN_IR% -echo %PRE_BIN_INL% - - diff --git a/src/compute/skc/path_builder_cl_12.c b/src/compute/skc/path_builder_cl_12.c deleted file mode 100644 index e915dffada..0000000000 --- a/src/compute/skc/path_builder_cl_12.c +++ /dev/null @@ -1,1443 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include -#include -#include -#include -#include - -#include "common/cl/assert_cl.h" - -#include "context.h" -#include "handle.h" -#include "grid.h" -#include "path.h" -#include "path_builder.h" - -#include "config_cl.h" -#include "export_cl_12.h" -#include "runtime_cl_12.h" -#include "path_builder_cl_12.h" - -// -// OpenCL 1.2 devices support mapping of buffers into the host address -// space. -// -// Mapped buffers must be aligned on MIN_DATA_TYPE_ALIGN_SIZE bit -// boundary (e.g. 128 bytes). This complicates coordinating sharing -// of data between the host and the device. -// -// Some OpenCL 2.0 devices support fine-grained shared virtual memory -// pointers with byte-addressing and allow simpler coordination -// strategies at the cost of maintaining cache coherency. -// -// The path builder is focused on moving bulk path data from the host -// into the device-managed "block" memory pool and arranging it into a -// SIMT/SIMD-friendly data structure that can be efficiently read by -// the rasterizer. -// -// Note that one simplifying assumption is that the maximum length of -// a *single* path can't be larger than what fits in the single extent -// (which is split into M subbuffers). This would be a very long path -// and a legitimate size limitation. -// -// For some systems, it may be appropriate to never pull path data -// into the device-managed block pool and instead present the path -// data to the device in a temporarily available allocated memory -// "zone" of paths that can be discarded all at once. -// -// For other systems, it may be appropriate to simply copy the path -// data from host to device. -// -// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be -// targeting support basic map/unmap functionality similar to OpenCL -// 1.2. Furthermore, not all OpenCL 2.0 devices support fine-grained -// sharing of memory and still require a map/unmap step... but note -// that they all support byte-aligned mapping and subbuffers. -// -// The general strategy that this particular CL_12 implementation uses -// is to allocate a large mappable bulk-data path buffer and an -// auxilary mappable command buffer. -// -// The buffers are split into a reasonable number of properly aligned -// subbuffers to enable simultaneous host and device access. 
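A minimal standalone sketch of the pattern the strategy comment above describes, under stated assumptions: one large CL_MEM_ALLOC_HOST_PTR allocation is carved into aligned regions with clCreateSubBuffer() and mapped for host-side writes with clEnqueueMapBuffer(). The function name, parameter names, and the blocking map are illustrative simplifications and not part of this patch; the deleted path_builder_cl_12.c below maps asynchronously (CL_FALSE) and tracks a completion event per command subbuffer instead.

#include <CL/cl.h>
#include <stddef.h>

//
// Carve one large allocation into `subbuf_count` equally sized subbuffers
// and map the first one for write-only host access.  Returns the mapped
// host pointer, or NULL on failure.  `subbuf_size` is assumed to be a
// multiple of the device's base address alignment.
//
static void *
map_first_subbuffer(cl_context context,
                    cl_command_queue cq,
                    size_t const subbuf_size,
                    cl_uint const subbuf_count,
                    cl_mem * const buffer_out,
                    cl_mem * const subbuf_out)
{
  cl_int err;

  // one large device extent backs all subbuffers
  cl_mem const buffer = clCreateBuffer(context,
                                       CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                       subbuf_size * subbuf_count,
                                       NULL,&err);
  if (err != CL_SUCCESS)
    return NULL;

  // a subbuffer is just an aligned region of the parent buffer
  cl_buffer_region const region = { 0, subbuf_size };

  cl_mem const subbuf = clCreateSubBuffer(buffer,
                                          CL_MEM_HOST_WRITE_ONLY,
                                          CL_BUFFER_CREATE_TYPE_REGION,
                                          &region,&err);
  if (err != CL_SUCCESS) {
    clReleaseMemObject(buffer);
    return NULL;
  }

  // blocking map for simplicity -- the real implementation maps with
  // CL_FALSE and waits on a per-subbuffer completion event instead
  void * const host = clEnqueueMapBuffer(cq,subbuf,
                                         CL_TRUE,
                                         CL_MAP_WRITE_INVALIDATE_REGION,
                                         0,subbuf_size,
                                         0,NULL,NULL,
                                         &err);
  if (err != CL_SUCCESS) {
    clReleaseMemObject(subbuf);
    clReleaseMemObject(buffer);
    return NULL;
  }

  *buffer_out = buffer;
  *subbuf_out = subbuf;
  return host;
}

Splitting the extent into fixed-size, aligned subbuffers is what lets the host fill one region while the device reads another, since OpenCL 1.2 only guarantees mappings at the device's reported alignment rather than at arbitrary byte offsets.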
-// - -// -// Blocks: -// 1 extent -// M mapped subbuffers (configurable) to allow for concurrency -// -// Commands: -// 1 extent -// M mapped subbuffers (configurable) to allow for concurrency -// -// Spans: -// M hi/lo structures -// -// { cl_sub, void*, event, base } -// -// - size of sub buffer -// - remaining -// -// - counts -// - -// -// For any kernel launch, at most one path will be discontiguous and -// defined across two sub-buffers. -// -// Nodes are updated locally until full and then stored so they will -// never be incomplete. Headers are stored locally until the path is -// ended so they will never be incomplete. -// -// A line, quad or cubic acquires 4/6/8 segments which may be spread -// across one or more congtiguous blocks. -// -// If a flush() occurs then the remaining columns of multi-segment -// paths are initialized with zero-length line, quad, cubic elements. -// -// Every block's command word has a type and a count acquired from a -// rolling counter. -// -// The kernel is passed two spans of blocks { base, count } to -// process. The grid is must process (lo.count + hi.count) blocks. -// - -struct skc_subbuffer_blocks -{ - cl_mem device; - void * host; -}; - -struct skc_subbuffer_cmds -{ - cl_mem device; - void * host; - cl_event map; -}; - -// -// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer ) -// - -typedef skc_uint skc_ringdex_t; - -union skc_ringdex_expand -{ - div_t qr; - - struct { -#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0 - skc_uint subbuf; - skc_uint block; -#else - skc_uint block; - skc_uint subbuf; -#endif - }; -}; - -// -// this record is executed by the grid -// - -struct skc_release_record -{ - struct skc_path_builder_impl * impl; // back pointer to impl - - skc_grid_t grid; // pointer to scheduled grid - - skc_uint from; // inclusive starting index : [from,to) - skc_uint to; // non-inclusive ending index : [from,to) -}; - -// -// -// - -struct skc_path_builder_impl -{ - struct skc_path_builder * path_builder; - - struct skc_runtime * runtime; - - cl_command_queue cq; - - struct { - cl_kernel alloc; - cl_kernel copy; - } kernels; - - // - // FIXME -- make this pointer to constant config - // - // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - struct { - skc_uint subbufs; // how many subbufs in the buffer? - - struct { - skc_uint buffer; // how many blocks in the buffer? - skc_uint subbuf; // how many blocks in a subbuf? 
- } blocks_per; - } ring; - // - // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^ - // - - struct { - cl_mem buffer; // backing buffer for blocks - struct skc_subbuffer_blocks * subbufs; // array of structures - } blocks; - - struct { - cl_mem buffer; // backing buffer for commands - struct skc_subbuffer_cmds * subbufs; // array of structures - } cmds; - - struct { - struct skc_release_record * records; // max release records is equal to max subbufs - skc_path_t * paths; // max paths is less than or equal to max commands - } release; - - cl_mem reads; // each kernel only requires one word to store the block pool "base" - - struct { - skc_uint rolling; // rolling counter used by cmds to map to block pool alloc - skc_ringdex_t from; - skc_ringdex_t to; - } prev; - - struct { - skc_ringdex_t from; - skc_ringdex_t to; - } curr; - - struct { - struct skc_path_head * head; // pointer to local path header -- not written until path end - struct skc_path_node * node; // pointer to local node -- may alias head until head is full - - struct { - skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated - union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids - skc_uint rem; // how many id slots left in node block - } ids; - - struct { - skc_uint rem; // how many subblocks left in block? - skc_uint rolling; // rolling counter of block of subblocks - float * next; // next subblock in current subblock block - skc_uint idx; // index of next subblock - } subblocks; - - struct { - skc_uint one; // .block = 1 - skc_uint next; // rolling counter used by cmds to map to block pool alloc - } rolling; - - skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current - } wip; -}; - -// -// FIXME -- move to a pow2 subbuffer size and dispense with division -// and modulo operations -// - -static -union skc_ringdex_expand -skc_ringdex_expand(struct skc_path_builder_impl * const impl, - skc_ringdex_t const ringdex) -{ - return (union skc_ringdex_expand){ - .qr = div(ringdex,impl->ring.blocks_per.subbuf) - }; -} - -static -void -skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl) -{ - // - // FIXME - which is faster? - // -#if 1 - impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer; -#else - impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer) ? 
-1 : impl->wip.to; -#endif - - // this path is too long -- for now assert() and die - assert(impl->wip.to != impl->curr.from); -} - -static -skc_ringdex_t -skc_ringdex_span(struct skc_path_builder_impl * const impl, - skc_ringdex_t const from, - skc_ringdex_t const to) -{ - return (to - from) % impl->ring.blocks_per.buffer; -} - -static -void -skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl) -{ - union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); - - // nothing to do if this is the first block in the subbuf - if (to.block == 0) - return; - - skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs; - - // otherwise increment and mod - impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf; -} - -static -skc_bool -skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl) -{ - return impl->curr.from == impl->curr.to; -} - -static -skc_bool -skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl) -{ - return impl->prev.from == impl->prev.to; -} - -static -skc_uint -skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl, - skc_uint const to_block) -{ - // no blocks acquired OR this is last block in subbuf - return !((impl->wip.to == impl->curr.to) || (to_block == 0)); -} - -// -// -// - -static -struct skc_release_record * -skc_release_curr(struct skc_path_builder_impl * const impl) -{ - union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); - - return impl->release.records + curr_from.subbuf; -} - -// -// FIXME -- get rid of all distant config references -- grab them at all at creation time -// - -static -void -skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl) -{ - // init header counters // { handle, blocks, nodes, prims } - impl->wip.head->header = (union skc_path_header){ - .handle = 0, - .blocks = 0, - .nodes = 0, - .prims = 0 - }; - - // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS - impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN }; - - // point wip ids at local head node - impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node - impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere - - // start with no subblocks - impl->wip.subblocks.rem = 0; -} - -// -// -// - -static -void -skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl) -{ -#if 1 - // - // FIXME -- a Duff's device might be optimal here but would have to - // be customized per device since node's could be 16-128+ words - // - while (impl->wip.ids.rem > 0) - { - impl->wip.ids.rem -= 1; - impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID; - impl->wip.ids.next += 1; - } -#else - memset(&impl->wip.ids.next->u32, - SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF - sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem); - - impl->wip.ids.next += impl->wip.ids.rem; - impl->wip.ids.rem = 0; -#endif -} - -// -// -// - -static -void -skc_zero_float(skc_float * p, skc_uint rem) -{ - memset(p,0,sizeof(*p)*rem); -} - -static -void -skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder) -{ - // - // FIXME -- it might be more performant to zero the remaining - // columns in a subblock -- a subblock at a time -- instead of the - // same column across all the subblocks - // -#if 0 - while (path_builder->line.rem > 0) - { - --path_builder->line.rem; - - *path_builder->line.coords[0]++ = 0.0f; - 
*path_builder->line.coords[1]++ = 0.0f; - *path_builder->line.coords[2]++ = 0.0f; - *path_builder->line.coords[3]++ = 0.0f; - } - - while (path_builder->quad.rem > 0) - { - --path_builder->quad.rem; - - *path_builder->line.coords[0]++ = 0.0f; - *path_builder->line.coords[1]++ = 0.0f; - *path_builder->line.coords[2]++ = 0.0f; - *path_builder->line.coords[3]++ = 0.0f; - *path_builder->line.coords[4]++ = 0.0f; - *path_builder->line.coords[5]++ = 0.0f; - } - - while (path_builder->cubic.rem > 0) - { - --path_builder->cubic.rem; - - *path_builder->line.coords[0]++ = 0.0f; - *path_builder->line.coords[1]++ = 0.0f; - *path_builder->line.coords[2]++ = 0.0f; - *path_builder->line.coords[3]++ = 0.0f; - *path_builder->line.coords[4]++ = 0.0f; - *path_builder->line.coords[5]++ = 0.0f; - *path_builder->line.coords[6]++ = 0.0f; - *path_builder->line.coords[7]++ = 0.0f; - } -#else - if (path_builder->line.rem > 0) - { - skc_zero_float(path_builder->line.coords[0],path_builder->line.rem); - skc_zero_float(path_builder->line.coords[1],path_builder->line.rem); - skc_zero_float(path_builder->line.coords[2],path_builder->line.rem); - skc_zero_float(path_builder->line.coords[3],path_builder->line.rem); - - path_builder->line.rem = 0; - } - - if (path_builder->quad.rem > 0) - { - skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem); - skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem); - - path_builder->quad.rem = 0; - } - - if (path_builder->cubic.rem > 0) - { - skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem); - skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem); - - path_builder->cubic.rem = 0; - } -#endif -} - -// -// -// - -static -void -skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl, - skc_uint from, - skc_uint to) -{ - // to might be out of range - to = to % impl->ring.subbufs; - -#if 0 - fprintf(stderr,"unmap: [%2u,%2u)\n",from,to); -#endif - - while (from != to) // 'to' might be out of range - { - // bring 'from' back in range - from = from % impl->ring.subbufs; - - struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; - struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; - - cl(EnqueueUnmapMemObject(impl->cq, - blocks->device, - blocks->host, - 0,NULL,NULL)); - - cl(EnqueueUnmapMemObject(impl->cq, - cmds->device, - cmds->host, - 0,NULL,NULL)); - - // bring from back in range - from = ++from % impl->ring.subbufs; - } -} - -// -// FIXME -- reuse this in create() -// - -static -void -skc_path_builder_impl_map(struct skc_path_builder_impl * const impl, - skc_uint from, - skc_uint to) -{ - // to might be out of range - to = to % impl->ring.subbufs; - -#if 0 - fprintf(stderr," map: [%2u,%2u)\n",from,to); -#endif - - while (from != to) - { - cl_int cl_err; - - struct 
skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; - struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; - - blocks->host = clEnqueueMapBuffer(impl->cq, - blocks->device, - CL_FALSE, - CL_MAP_WRITE_INVALIDATE_REGION, - 0,impl->runtime->config->paths_copy.block.subbuf, - 0,NULL,NULL, - &cl_err); cl_ok(cl_err); - - cl(ReleaseEvent(cmds->map)); - - cmds->host = clEnqueueMapBuffer(impl->cq, - cmds->device, - CL_FALSE, - CL_MAP_WRITE_INVALIDATE_REGION, - 0,impl->runtime->config->paths_copy.command.subbuf, - 0,NULL,&cmds->map, - &cl_err); cl_ok(cl_err); - - // bring from back in range - from = ++from % impl->ring.subbufs; - } - // - // FIXME -- when we switch to out of order queues we'll need a barrier here - // -} - -// -// -// - -static -void -skc_path_builder_release_dispose(struct skc_release_record * const release, - struct skc_path_builder_impl * const impl) -{ - struct skc_runtime * runtime = impl->runtime; - - if (release->from <= release->to) // no wrap - { - skc_path_t const * paths = impl->release.paths + release->from; - skc_uint count = release->to - release->from; - - skc_grid_deps_unmap(runtime->deps,paths,count); - skc_runtime_path_device_release(runtime,paths,count); - } - else // from > to implies wrap - { - skc_path_t const * paths_lo = impl->release.paths + release->from; - skc_uint count_lo = impl->ring.blocks_per.buffer - release->from; - - skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo); - skc_runtime_path_device_release(runtime,paths_lo,count_lo); - - skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to); - skc_runtime_path_device_release(runtime,impl->release.paths,release->to); - } - - release->to = release->from; -} - -static -void -skc_path_builder_grid_pfn_dispose(skc_grid_t const grid) -{ - struct skc_release_record * const release = skc_grid_get_data(grid); - struct skc_path_builder_impl * const impl = release->impl; - - skc_path_builder_release_dispose(release,impl); -} - -static -void -// skc_path_builder_complete(struct skc_release_record * const release) -skc_path_builder_complete(skc_grid_t grid) -{ - // - // notify deps that this grid is complete enough for other grids to - // proceed - // - // the path builder still has some cleanup to do before all its - // resources can be reused - // - skc_grid_complete(grid); -} - -static -void -skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid) -{ - SKC_CL_CB(status); - - struct skc_release_record * const release = skc_grid_get_data(grid); - - SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid); -} - -// -// -// - -static -void -skc_path_builder_grid_pfn_waiting(skc_grid_t const grid) -{ - struct skc_release_record * const release = skc_grid_get_data(grid); - struct skc_path_builder_impl * const impl = release->impl; - - // 1. flush incomplete subblocks of path elements - // 2. unmap subbuffer on cq.unmap - // 3. flush cq.unmap - // 4. launch kernel on cq.kernel but wait for unmap completion - // 5. flush cq.kernel - // 6. remap relevant subbuffers on cq.map but wait for kernel completion - // 7. 
flush cq.map - - // - // FIXME -- can be smarter about flushing if the wip paths are not - // in the same subbuf as curr.to - // - // THIS IS IMPORTANT TO FIX - // - - // flush incomplete subblocks - skc_path_builder_finalize_subblocks(impl->path_builder); - - // - // get range of subbufs that need to be unmapped - // - // note that impl->prev subbufs have already been unmapped - // - union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); - union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to); - skc_uint const is_partial = curr_to.block > 0; - skc_uint const unmap_to = curr_to.subbuf + is_partial; - - // - // unmap all subbufs in range [from,to) - // - skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to); - - // - // launch kernels - // - skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to); - skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to); - skc_uint const pb_cmds = pb_prev_span + pb_curr_span; - - // - // 1) allocate blocks from pool - // - - // - // FIXME -- pack integers into struct/vector - // - cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw))); - cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads))); - cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf))); - cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds))); - - skc_device_enqueue_kernel(impl->runtime->device, - SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, - impl->cq, - impl->kernels.alloc, - 1, - 0,NULL,NULL); - - // - // 2) copy blocks from unmapped device-accessible memory - // - - // - // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7 - // - cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw))); - - cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw))); - cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask))); - - cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads))); - cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf))); - - cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer))); - cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer))); - - cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer))); - cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling))); - - cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from))); - cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span))); - cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from))); - - cl_event complete; - - skc_device_enqueue_kernel(impl->runtime->device, - SKC_DEVICE_KERNEL_ID_PATHS_COPY, - impl->cq, - impl->kernels.copy, - pb_cmds, - 0,NULL,&complete); - - // set a callback on completion - cl(SetEventCallback(complete,CL_COMPLETE, - skc_path_builder_paths_copy_cb, - grid)); - - // immediately release - cl(ReleaseEvent(complete)); - - // - // remap as many subbuffers as possible after the kernel completes - // - // note that remaps are async and enqueued on the same command queue - // as the kernel launch - // - // we can't remap subbuffers that are in the possibly empty range - // - // cases: - // - // - curr.to == wip.to which means no blocks have been acquired - // - curr.to points to first block in (next) subbuf - // - otherwise, wip acquired 
blocks in the curr.to subbuf - // - // check for these first 2 cases! - // - union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from); - skc_uint const no_wip = impl->curr.to == impl->wip.to; - skc_uint map_to = curr_to.subbuf + (is_partial && no_wip); - - // remap all subbufs in range [from,to) - skc_path_builder_impl_map(impl,prev_from.subbuf,map_to); - - // flush command queue - cl(Flush(impl->cq)); - - // save rolling - impl->prev.rolling = impl->wip.rolling.next; - - // update prev and curr - if (no_wip) - { - // - // if there was no wip then round up to the next subbuf - // - skc_ringdex_wip_to_subbuf_inc(impl); - - // - // update prev/curr with with incremented wip - // - impl->prev.from = impl->prev.to = impl->wip.to; - impl->curr.from = impl->curr.to = impl->wip.to; - } - else - { - // - // update prev with wip partials - // - impl->prev.from = impl->curr.to; - impl->prev.to = impl->wip .to; - - // - // start curr on a new subbuf boundary - // - skc_ringdex_wip_to_subbuf_inc(impl); - - impl->curr.from = impl->wip.to; - impl->curr.to = impl->wip.to; - } -} - -// -// -// - -static -void -skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl, - skc_uint const subbuf) -{ - // - // FIXME -- move to a power-of-two subbuf size and kickstart path - // copies as early as possible - // - // FIXME -- the subbufs "self-clock" (flow control) the kernel - // launches and accounting. Combine all the subbuffers and release - // records into a single indexable struct instead of 3. - // - struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf; - struct skc_release_record * const release = impl->release.records + subbuf; - struct skc_scheduler * const scheduler = impl->runtime->scheduler; - - // can't proceed until the paths have been released - SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to); - - // throw in a scheduler yield ... FIXME -- get rid of - skc_scheduler_yield(scheduler); - - // can't proceed until the subbuffer is mapped - cl(WaitForEvents(1,&sc->map)); -} - -// -// -// - -static -union skc_ringdex_expand -skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl) -{ - // break ringdex into components - union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); - - // does wip ringdex point to a new subbuffer? 
- if (to.block == 0) - { - // potentially spin/block waiting for subbuffer - skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf); - } - - // post increment wip.to - skc_ringdex_wip_to_block_inc(impl); - - return to; -} - -// -// -// - -static -skc_uint -skc_rolling_block(skc_uint const rolling, skc_uint const tag) -{ - return rolling | tag; -} - -static -skc_uint -skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag) -{ - return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag; -} - -static -void -skc_rolling_inc(struct skc_path_builder_impl * const impl) -{ - impl->wip.rolling.next += impl->wip.rolling.one; -} - -// -// -// - -static -void * -skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl, - skc_uint const rolling, - skc_cmd_paths_copy_tag const tag) -{ - // bump blocks count - impl->wip.head->header.blocks += 1; - - // acquire a block - union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl); - - // make a pointer - union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host; - - // store command for block - cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag); - -#if 0 - // store command for block - cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag); - - // increment rolling - skc_rolling_inc(impl); -#endif - - // return pointer to block - float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host; - - // FIXME -- make it easier to get config constant - return blocks_subbuf + (to.block * impl->runtime->config->block.words); -} - -// -// -// - -static -void -skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl) -{ - // store command to subbuf and get pointer to blocks subbuf - void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling, - SKC_CMD_PATHS_COPY_TAG_NODE); - - // copy head to blocks subbuf -- write-only - memcpy(block,impl->wip.node,impl->runtime->config->block.bytes); -} - -static -void -skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl) -{ - // store command to subbuf and get pointer to blocks subbuf - void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, - SKC_CMD_PATHS_COPY_TAG_HEAD); - - // copy head to blocks subbuf -- write-only - memcpy(block,impl->wip.head,impl->runtime->config->block.bytes); - - // increment rolling - skc_rolling_inc(impl); - - // the 'to' index is non-inclusive so assign wip.to after flush_head - impl->curr.to = impl->wip.to; -} - -// -// -// - -static -void -skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl) -{ - // update final block id in node - impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT); - - // if wip.ids is not the header then flush now full wip node - if (impl->wip.head->header.nodes > 0) - skc_path_builder_impl_flush_node(impl); - - // bump node count - impl->wip.head->header.nodes += 1; - - // save current rolling - impl->wip.ids.rolling = impl->wip.rolling.next; - - // increment rolling - skc_rolling_inc(impl); - - // update wip.ids.* - impl->wip.ids.next = impl->wip.node->tag_ids; - impl->wip.ids.rem = impl->runtime->config->block.words; -} - -static -void -skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl) -{ - impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure - impl->wip.subblocks.rolling = 
impl->wip.rolling.next; - impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, - SKC_CMD_PATHS_COPY_TAG_SEGS); - impl->wip.subblocks.idx = 0; - - // increment rolling - skc_rolling_inc(impl); -} - -// -// -// - -static -void -skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl, - skc_block_id_tag tag, - skc_uint vertices, - float * * subblocks) -{ - // - // FIRST TAG RECORDS THE ELEMENT TYPE - // - while (true) - { - // if only one block id left in node then acquire new node block - // and append its block id as with a next tag - if (impl->wip.ids.rem == 1) - skc_path_builder_impl_new_node_block(impl); - - // if zero subblocks left then acquire a new subblock block and - // append its block id - if (impl->wip.subblocks.rem == 0) - skc_path_builder_impl_new_segs_block(impl); - - // save first command -- tag and subblocks may have been updated - impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag); - - // increment node block subblock pointer - impl->wip.ids.next += 1; - impl->wip.ids.rem -= 1; - - // how many vertices can we store - skc_uint rem = min(vertices,impl->wip.subblocks.rem); - - // decrement vertices - vertices -= rem; - impl->wip.subblocks.rem -= rem; - impl->wip.subblocks.idx += rem; - - // assign subblocks - do { - *subblocks++ = impl->wip.subblocks.next; - impl->wip.subblocks.next += impl->runtime->config->subblock.words; - // FIXME -- move constants closer to structure - } while (--rem > 0); - - // anything left to do? - if (vertices == 0) - break; - - // any tag after this will be a caboose command - tag = SKC_BLOCK_ID_TAG_PATH_NEXT; - } -} - -// -// -// - -static -void -skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path) -{ - // finalize incomplete active subblocks -- we don't care about any - // remaining unused subblocks in block - skc_path_builder_finalize_subblocks(impl->path_builder); - - // mark remaining wips.ids in the head or node as invalid - skc_path_builder_impl_finalize_node(impl); - - // flush node if rem > 0 and node is not actually head - if (impl->wip.head->header.nodes >= 1) - skc_path_builder_impl_flush_node(impl); - - // acquire path host id - *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN - - // save path host handle - impl->wip.head->header.handle = *path; - - // flush head -- acquires a block and bumps head->header.blocks - skc_path_builder_impl_flush_head(impl); - - // get current release - struct skc_release_record * const release = skc_release_curr(impl); - - // acquire grid if null - if (release->grid == NULL) - { - release->grid = - SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - &release->grid, // NULL on start/force - release, // data payload - skc_path_builder_grid_pfn_waiting, - NULL, // no execute pfn - skc_path_builder_grid_pfn_dispose); - } - - // update grid map - skc_grid_map(release->grid,*path); - - // update path release - impl->release.paths[release->to] = *path; - - // increment release.to - release->to = (release->to + 1) % impl->ring.blocks_per.buffer; - - // add guard bit - *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH; - -#if 1 - // - // eager kernel launch? 
- // - { - union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from); - union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to); - - if (curr_from.subbuf != curr_to.subbuf) - { - skc_grid_start(release->grid); - // skc_scheduler_yield(impl->runtime->scheduler); - } - } -#endif -} - -// -// FIXME -- clean up accessing of CONFIG constants in these 3 routines -// - -static -void -skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl) -{ - // acquire subblock pointers - skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4, - impl->path_builder->line.coords); - - // increment line count - impl->wip.head->header.prims += 1; - - // update rem_count_xxx count - impl->path_builder->line.rem = impl->runtime->config->subblock.words; -} - -static -void -skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl) -{ - // acquire subblock pointers - skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6, - impl->path_builder->quad.coords); - - // increment line count - impl->wip.head->header.prims += 1; - - // update rem_count_xxx count - impl->path_builder->quad.rem = impl->runtime->config->subblock.words; -} - -static -void -skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl) -{ - // acquire subblock pointers - skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8, - impl->path_builder->cubic.coords); - - // increment line count - impl->wip.head->header.prims += 1; - - // update rem_count_xxx count - impl->path_builder->cubic.rem = impl->runtime->config->subblock.words; -} - -// -// -// - -static -void -skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl) -{ - // decrement reference count - if (--impl->path_builder->refcount != 0) - return; - - // - // otherwise, dispose of everything - // - struct skc_runtime * const runtime = impl->runtime; - - // free path builder - skc_runtime_host_perm_free(impl->runtime,impl->path_builder); - - // release cq - skc_runtime_release_cq_in_order(runtime,impl->cq); - - // release kernels - cl(ReleaseKernel(impl->kernels.alloc)); - cl(ReleaseKernel(impl->kernels.copy)); - - // free blocks extents - cl(ReleaseMemObject(impl->blocks.buffer)); - skc_runtime_host_perm_free(runtime,impl->blocks.subbufs); - - cl(ReleaseMemObject(impl->cmds.buffer)); - skc_runtime_host_perm_free(runtime,impl->cmds.subbufs); - - // free records - skc_runtime_host_perm_free(runtime,impl->release.records); - skc_runtime_host_perm_free(runtime,impl->release.paths); - - // release staging head and node - skc_runtime_host_perm_free(runtime,impl->wip.head); - skc_runtime_host_perm_free(runtime,impl->wip.node); - - // release reads scratch array - cl(ReleaseMemObject(impl->reads)); - - // for all subbuffers - // unmap subbuffer - // release subbuffer - // printf("%s not releasing subbuffers\n",__func__); - - skc_runtime_host_perm_free(impl->runtime,impl); -} - -// -// -// - -skc_err -skc_path_builder_cl_12_create(struct skc_context * const context, - struct skc_path_builder * * const path_builder) -{ - // - // retain the context - // skc_context_retain(context); - // - struct skc_runtime * const runtime = context->runtime; - - // allocate path builder - (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder)); - - // init state - SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY); - - (*path_builder)->context = context; - - // save opaque impl-specific 
pointers - (*path_builder)->begin = skc_path_builder_pfn_begin; - (*path_builder)->end = skc_path_builder_pfn_end; - (*path_builder)->new_line = skc_path_builder_pfn_new_line; - (*path_builder)->new_quad = skc_path_builder_pfn_new_quad; - (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic; - (*path_builder)->release = skc_path_builder_pfn_release; - - // initialize path builder counts - (*path_builder)->line.rem = 0; - (*path_builder)->quad.rem = 0; - (*path_builder)->cubic.rem = 0; - - (*path_builder)->refcount = 1; - - struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - (*path_builder)->impl = impl; - - // - // init impl - // - impl->path_builder = *path_builder; - impl->runtime = runtime; - - impl->cq = skc_runtime_acquire_cq_in_order(runtime); - - impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC); - impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY); - - // - // FIXME -- let these config constants remain constant and in place - // - struct skc_config const * const config = runtime->config; - - impl->ring.subbufs = config->paths_copy.buffer.count; - impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count; - impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count; - // - // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - // - - cl_int cl_err; - - // allocate large device-side extent for path data - impl->blocks.buffer = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere - NULL,&cl_err); cl_ok(cl_err); - - // allocate small host-side array of pointers to mapped subbufs - impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, - impl->ring.subbufs * - sizeof(*impl->blocks.subbufs)); - - // allocate large device-side extent for path copy commands - impl->cmds.buffer = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - config->paths_copy.command.buffer, - NULL,&cl_err); cl_ok(cl_err); - - // allocate small host-side array of pointers to mapped subbufs - impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, - impl->ring.subbufs * - sizeof(*impl->cmds.subbufs)); - - // allocate small host-side array of intervals of path handles - impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, - impl->ring.subbufs * - sizeof(*impl->release.records)); - - // allocate large host-side array that is max # of path handles in flight - impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, - impl->ring.blocks_per.buffer * - sizeof(*impl->release.paths)); - - // small scratch used by kernels - impl->reads = clCreateBuffer(runtime->cl.context, - CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, - sizeof(skc_uint) * impl->ring.subbufs, - NULL,&cl_err); cl_ok(cl_err); - - // initialize release record with impl backpointer - for (skc_uint ii=0; iiring.subbufs; ii++) - { - struct skc_release_record * record = impl->release.records + ii; - - record->impl = impl; - record->grid = NULL; - record->from = record->to = ii * impl->ring.blocks_per.subbuf; - } - - // - // allocate and map subbuffers -- we always check the command - // subbuffer's map/unmap events before touching it or its associated - // block subbuffer. 
- // - struct skc_subbuffer_blocks * sb = impl->blocks.subbufs; - struct skc_subbuffer_cmds * sc = impl->cmds .subbufs; - - cl_buffer_region rb = { 0, config->paths_copy.block.subbuf }; - cl_buffer_region rc = { 0, config->paths_copy.command.subbuf }; - - // for each subbuffer - for (skc_uint ii=0; iipaths_copy.buffer.count; ii++) - { - sb->device = clCreateSubBuffer(impl->blocks.buffer, - CL_MEM_HOST_WRITE_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, - &rb, - &cl_err); cl_ok(cl_err); - - sb->host = clEnqueueMapBuffer(impl->cq, - sb->device, - CL_FALSE, - CL_MAP_WRITE_INVALIDATE_REGION, - 0,rb.size, - 0,NULL,NULL, - &cl_err); cl_ok(cl_err); - - sc->device = clCreateSubBuffer(impl->cmds.buffer, - CL_MEM_HOST_WRITE_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, - &rc, - &cl_err); cl_ok(cl_err); - - sc->host = clEnqueueMapBuffer(impl->cq, - sc->device, - CL_FALSE, - CL_MAP_WRITE_INVALIDATE_REGION, - 0,rc.size, - 0,NULL,&sc->map, - &cl_err); cl_ok(cl_err); - sb += 1; - sc += 1; - - rb.origin += rb.size; - rc.origin += rc.size; - } - - // - // initialize remaining members - // - impl->prev.from = 0; - impl->prev.to = 0; - impl->prev.rolling = 0; - - impl->curr.from = 0; - impl->curr.to = 0; - - impl->wip.to = 0; - - impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes); - impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes); - - impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks; - impl->wip.rolling.next = 0; - - // for now, completely initialize builder before returning - cl(Finish(impl->cq)); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/path_builder_cl_12.h b/src/compute/skc/path_builder_cl_12.h deleted file mode 100644 index 20bb13cbdf..0000000000 --- a/src/compute/skc/path_builder_cl_12.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef PATH_BUILDER_CL_12_ONCE -#define PATH_BUILDER_CL_12_ONCE - -// -// -// - -#include "block.h" - -// -// A tag type that fits into the block id tag bitfield -// - -typedef enum skc_cmd_paths_copy_tag { - - SKC_CMD_PATHS_COPY_TAG_SEGS, - SKC_CMD_PATHS_COPY_TAG_NODE, - SKC_CMD_PATHS_COPY_TAG_HEAD, - - SKC_CMD_PATHS_COPY_TAG_COUNT - -} skc_cmd_paths_copy_tag; - - -SKC_STATIC_ASSERT(SKC_CMD_PATHS_COPY_TAG_COUNT <= SKC_BLOCK_ID_TAG_COUNT); - -// -// -// - -#endif - -// -// -// - diff --git a/src/compute/skc/paths_copy.cl b/src/compute/skc/paths_copy.cl deleted file mode 100644 index 06cc393c75..0000000000 --- a/src/compute/skc/paths_copy.cl +++ /dev/null @@ -1,543 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "path_builder_cl_12.h" -#include "path.h" -#include "block_pool_cl.h" - -// -// -// - -#if 0 - -// -// SIMD AVX2 -// - -#define SKC_PATHS_COPY_WORDS_PER_ELEM 8 -#define SKC_PATHS_COPY_SUBGROUP_SIZE 1 -#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES - -typedef skc_uint8 skc_paths_copy_elem; -typedef skc_uint8 skc_pb_idx_v; - -#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() - -#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS) - -#endif - -// -// -// - -#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) -#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) -#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) -#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) - -// FIXME -- use SUBGROUP terminology everywhere -#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) - -// -// -// - -#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ - (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) - -#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ - (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) - -// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) - -// -// -// - -// -// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL -// - -#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) - -#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) - -#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) - -#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) - -// -// -// - -skc_uint -skc_sub_group_local_id() -{ -#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 - return get_sub_group_local_id(); -#else - return 0; -#endif -} - -// -// convert an atomic read counter offset to a block id -// - -skc_block_id_t -skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, - skc_uint const bp_idx_mask, - skc_uint const bp_reads, - skc_uint const bp_off) -{ - skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; - - return bp_ids[bp_idx]; -} - -// -// -// - -void -skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to - skc_uint const bp_elems_idx, - __global skc_paths_copy_elem const * const pb_elems, // from - skc_uint const pb_elems_idx) -{ - for (skc_uint ii=0; ii\n",ii,bp_idx,b,elem C); - - SKC_PATHS_COPY_ELEM_EXPAND(); - - // store the elem back - (bp_elems+bp_elems_idx)[ii] = elem; - } -} - -// -// -// - -void -skc_host_map_update(__global skc_uint * const host_map, - skc_uint const block, - skc_paths_copy_elem const elem) -{ - // - // write first elem to map -- FIXME -- this is a little nasty - // because it relies on the the host handle always being the first - // word in the path header. - // - // OTOH, this is not unreasonable. The alternative is to have a - // separate kernel initializing the map. 
- // -#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 - if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) -#endif - { -#if SKC_PATHS_COPY_ELEM_WORDS == 1 - host_map[elem] = block; -#if 0 - printf("[%u] = %u\n",elem,block); -#endif -#else - host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; -#endif - } -} - -// -// -// - -void -skc_copy_head(__global skc_uint * const host_map, - skc_uint const block, - __global skc_paths_copy_elem * const bp_elems, // to - skc_uint const bp_elems_idx, - __global skc_block_id_t const * const bp_ids, - skc_uint const bp_reads, - skc_uint const bp_idx_mask, - __global skc_paths_copy_elem const * const pb_elems, // from - skc_uint const pb_elems_idx, - skc_uint const pb_rolling) -{ - // - // if there are more path header words than there are - // threads-per-block then we can just copy the initial header words - // -#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) - for (skc_uint ii=0; ii= pb_size) - pb_idx -= pb_size; - - // broadcast load the command - union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; - - // what do we want pb_elems do with this block? - skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); - - // compute offset from rolling base to get index into block pool ring allocation - skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); - - // convert the pb_cmd's offset counter pb_elems a block id - skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); - printf("< %8u >\n",block); - } -#endif - - // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() - skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; - - // calculate bp_elems (to) / pb_elems (from) - skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; - skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; - - if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) - { -#if 0 - if (tid == 0) - printf("%3u, segs\n",bp_off); -#endif - skc_copy_segs(bp_elems, - bp_elems_idx, - pb_elems, - pb_elems_idx); - } - else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) - { -#if 0 - if (tid == 0) - printf("%3u, NODE\n",bp_off); -#endif - skc_copy_node(bp_elems, // to - bp_elems_idx, - bp_ids, - bp_reads, - bp_idx_mask, - pb_elems, // from - pb_elems_idx, - pb_rolling); - } - else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) - { -#if 0 - if (tid == 0) - printf("%3u, HEAD\n",bp_off); -#endif - skc_copy_head(host_map, - block, - bp_elems, // to - bp_elems_idx, - bp_ids, - bp_reads, - bp_idx_mask, - pb_elems, // from - pb_elems_idx, - pb_rolling); - } -} - -// -// -// - -__kernel -SKC_PATHS_ALLOC_KERNEL_ATTRIBS -void -skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, - __global skc_uint * const bp_alloc, - skc_uint const bp_alloc_idx, - skc_uint const pb_cmd_count) -{ - // - // allocate blocks in block pool - // - skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); - - // store in slot - bp_alloc[bp_alloc_idx] = reads; - -#if 0 - printf("pc: %8u + %u\n",reads,pb_cmd_count); -#endif -} - -// -// -// diff --git a/src/compute/skc/paths_reclaim.cl b/src/compute/skc/paths_reclaim.cl deleted file mode 100644 index 563160613c..0000000000 --- a/src/compute/skc/paths_reclaim.cl +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright 2017 Google Inc. 
- * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// FIXME -- a pre-allocation step could load the path header quads and -// total up the number of blocks in the workgroup or subgroup -// minimizing the number of later atomics adds. -// - -#include "device_cl_12_gen9.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "block.h" -#include "path.h" -#include "common.h" - -// -// -// - -#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS) - -#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS) - -// -// -// - -#if ( SKC_PATHS_RECLAIM_X == 1 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_PATHS_RECLAIM_X == 2 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_PATHS_RECLAIM_X == 4 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_PATHS_RECLAIM_X == 8 ) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_PATHS_RECLAIM_X == 16) -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_PATHS_RECLAIM_X" -#endif - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - SKC_BROADCAST_LAST_HELPER(E,I) - -// -// COMPILE-TIME PREDICATES -// - -#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \ - (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \ - SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I) - -#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \ - SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I) - -// -// RUN-TIME PREDICATES -// - -#define SKC_PATHS_RECLAIM_IS_HEADER(I) \ - (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS) - -// -// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL -// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK -// COMBOS (NOT NECESSARILY POW2) -// -// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR -// UINT TYPE INSTEAD OF A ULONG. -// - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 -#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint - -// -// -// - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ - (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ - ? 
0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ - S = sub_group_scan_exclusive_add(C) - -#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \ - (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK) - -// -// -// - -struct skc_reclaim -{ - skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; -}; - -__kernel -SKC_PATHS_RECLAIM_KERNEL_ATTRIBS -void -skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring - __global skc_uint * const bp_elems, // block pool blocks - __global skc_uint volatile * const bp_atomics, // read/write atomics - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const map, // path host-to-device map - struct skc_reclaim const reclaim) // array of host path ids -{ -#if (__OPENCL_VERSION__ < 200) - skc_uint const reclaim_stride = get_num_sub_groups(); -#else - skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); - -#if 0 - // - // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT - // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL - // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE - // RECLAMATION JOB ON THE REST OF THE PIPELINE. - // - for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) -#endif - { - // get host path id - skc_path_h const path = reclaim.aN[reclaim_idx]; - - // get the path header block from the map - skc_block_id_t id = map[path]; - - // - // blindly load all of the head elements into registers - // - skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - skc_uint count_blocks, count_nodes; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ - count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ - } \ - if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ - count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes); - } -#endif - - // - // acquire a span in the block pool ids ring for reclaimed ids - // - // FIXME count_blocks and atomic add can be done in same lane - // - skc_uint bp_ids_base = 0; - - if (get_sub_group_local_id() == 0) { - bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); - -#if 0 - printf("paths: bp_ids_base = %u\n",bp_ids_base); -#endif - } - - bp_ids_base = sub_group_broadcast(bp_ids_base,0); - - // - // shift away the tagged block id's tag - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); - - 
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; - } - - // - // - we'll skip subgroups that are entirely header - // - // - but we need to mark any header elements that partially fill - // a subgroup as invalid tagged block ids - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \ - if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \ - h##I = SKC_TAGGED_BLOCK_ID_INVALID; \ - } \ - } \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - { - // - // count reclaimable blocks in each lane - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ - skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = h##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // printf("P %7u ! %u\n",bp_ids_idx,h##I); - } - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, walk the nodes - // - do { - // id of next block is in last lane - id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); - - // get index of each element - skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // - // blindly load all of the node elements into registers - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // shift away the tagged block id's tag - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; - } - - // - // count reclaimable blocks in each lane - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint 
const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = n##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_PATHS_RECLAIM_BLOCK_EXPAND(); - - // printf("P %7u ! %u\n",bp_ids_idx,n##I); - - // any more nodes? - } while (--count_nodes > 0); - } -} - -// -// -// diff --git a/src/compute/skc/place.cl b/src/compute/skc/place.cl deleted file mode 100644 index 00f16f7843..0000000000 --- a/src/compute/skc/place.cl +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "common.h" -#include "atomic_cl.h" -#include "raster.h" -#include "tile.h" - -// -// -// - -#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1) -#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1) - -// -// -// - -#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK - -// -// -// - -#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE) - -// -// -// - -#if ( SKC_PLACE_X == 1 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_1() -#define SKC_PLACE_EXPAND_I_LAST 0 - -#elif ( SKC_PLACE_X == 2 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_2() -#define SKC_PLACE_EXPAND_I_LAST 1 - -#elif ( SKC_PLACE_X == 4 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_4() -#define SKC_PLACE_EXPAND_I_LAST 3 - -#elif ( SKC_PLACE_X == 8 ) -#define SKC_PLACE_EXPAND() SKC_EXPAND_8() -#define SKC_PLACE_EXPAND_I_LAST 7 - -#elif ( SKC_PLACE_X == 16) -#define SKC_PLACE_EXPAND() SKC_EXPAND_16() -#define SKC_PLACE_EXPAND_I_LAST 15 -#endif - -// -// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE -// COALESCED WRITES. LO FIRST, FOLLOWED BY HI. -// -// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE -// KERNELS USE DIFFERENT SUBGROUP SIZES. -// -// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE -// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID. -// -// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER -// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY -// ONLY SUPPORT A SUBGROUP SIZE OF 16. 
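// A hedged sketch of reading one key back in the simplest case above, where
// SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE -- 'row' and
// 'block_base' are illustrative names, not part of this file:
//
#if 0
  skc_uint  const base = block_base
                       + row * 2 * SKC_PLACE_SUBGROUP_SIZE         // lo words of this row
                       + get_sub_group_local_id();
  skc_uint  const lo   = bp_elems[base];                            // first coalesced write
  skc_uint  const hi   = bp_elems[base + SKC_PLACE_SUBGROUP_SIZE];  // second coalesced write
  skc_ulong const key  = ((skc_ulong)hi << 32) | lo;                // reassembled 64-bit TTSK/TTPK key
#endif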
-// - -#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE ) - -#define SKC_PLACE_STRIDE_H(L) (L) -#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1) -#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK)) - -#define SKC_PLACE_STRIDE_H(L) (L) -#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) -#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask - -#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK)) -#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) -#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO) - -#endif - -// -// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE -// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8) -// - -#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS) - -#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS) - -#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS) - -#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) - - -// -// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX -// -#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) -#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k)) - -// -// TTSK v2: -// -// 0 63 -// | TTSB ID | PREFIX | SPAN | X | Y | -// +---------+--------+---------+-----+-----+ -// | 27 | 1 (=0) | 12 (=0) | 12 | 12 | -// -// -// TTPK v2: -// -// 0 63 -// | TTPB ID | PREFIX | SPAN | X | Y | -// +---------+--------+------+-----+-----+ -// | 27 | 1 (=1) | 12 | 12 | 12 | -// -// - -// -// TTCK (32-BIT COMPARE) v1: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 18 | 7 | 7 | -// -// -// TTCK (32-BIT COMPARE) v2: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 15 | 9 | 8 | -// -// -// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 27 | 1 | 1 | 18 | 9 | 8 | -// - -union skc_subgroup_smem -{ - skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE - - 
struct { - struct { - skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; - skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; - } lo; - - struct { - skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; - skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; - } hi; - - // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK]; - }; - -}; - -// -// scatter scan max -// -static -skc_int_v_t -skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem, - skc_int_v_t const iss, - skc_int_v_t const ess) -{ - // - // prefix sums determine which lanes we're going to work on next - // - skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE); - skc_int_v_t const scratch_idx = max(ess,0); - - // - // SIMT - // - - // - // zero the volatile smem scratchpad using vector syntax - // - smem->scratch[get_sub_group_local_id()] = ( 0 ); - - // - // store source lane at starting lane - // - if (is_scratch_store) { - smem->scratch[scratch_idx] = get_sub_group_local_id(); - } - - // - // propagate lanes to right using max scan - // - skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()]; - skc_int_v_t const source = sub_group_scan_inclusive_max(scratch); - - return source; -} - -// -// -// - -static -skc_bool -skc_xk_clip(union skc_tile_clip const * const tile_clip, - skc_ttxk_t * const xk) -{ - // - // clip the sk and pk keys - // - // if fully clipped then return false - // - // alternatively -- we can expand all these keys in place - // - // alternatively -- keep sk and pk keys segregated because sk - // represents the vast majority of keys and are easier to process. - // don't mess with the fastpath! - // - return false; -} - -// -// -// - -static -skc_ttck_t -skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const sk_idx) -{ - skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0 - skc_uint const hi = smem->hi.sk[sk_idx]; - - skc_ttck_t ck; - - ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id - - // FIXME -- x and y should already be clipped and shifted - skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; - skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; - - ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; - - return ck; -} - -static -skc_ttck_t -skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const pk_idx, - skc_uint const dx) -{ - skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1 - skc_uint const hi = smem->hi.pk[pk_idx]; - - skc_ttck_t ck; - - ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id - - // FIXME -- x and y should already be clipped and shifted - skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; - skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; - - ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; - - return ck; -} - -// -// -// - -static -void -skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const sk) -{ - // - // Pretty sure you can never ever have an sk 
count equal to 0 - // - skc_uint ck_base = 0; - - // last lane performs the block pool allocation with an atomic increment - if (get_sub_group_local_id() == 0) { - ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk); - } - - // broadcast base to all lanes - ck_base = sub_group_broadcast(ck_base,0); - - // convert sk keys to ck keys - for (skc_uint ii=get_sub_group_local_id(); iilo.pk[idx]; - skc_uint const hi = smem->hi.pk[idx]; - - skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN; - skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN; - - return (span_lo | span_hi) + 1; -} - -// -// -// - -static -void -skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __local union skc_subgroup_smem volatile * const smem, - union skc_cmd_place const * const cmd, - skc_uint const pk) -{ - // bail out if pk queue is empty - if (pk == 0) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("%u\n",pk); -#endif - - // - // FIXME -- this nested loop iterates over the queue processing a - // subgroup of 64-bit keys at a time. This is probably not the most - // efficient approach so investigate how to store and iterate over a - // wider than subgroup (node-sized) queue of keys. - // - - // round up so we work with full subgroups - skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK; - skc_uint ii = 0; - - // nested loop that expands all ttpk keys -#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE) - for (; ii\n",xk_idx); -#endif - - return xk_idx; -#endif -} - -// -// -// -__kernel -SKC_PLACE_KERNEL_ATTRIBS -void -skc_kernel_place(__global skc_bp_elem_t * const bp_elems, - __global SKC_ATOMIC_UINT volatile * const place_atomics, - __global skc_ttck_t * const ck_extent, - __global union skc_cmd_place const * const cmds, - __global skc_block_id_t * const map, - skc_uint4 const clip, - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) - __local union skc_subgroup_smem volatile smem[1]; -#else - __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS]; - __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // This is a subgroup-centric kernel - // - // Which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // - // Test the raster's translated bounds against the composition's - // tile clip - // - // There are 3 cases: - // - // - the raster is completely clipped -> return - // - the raster is partially clipped -> all keys must clipped - // - the raster is not clipped -> no keys are tested - // - // - // There are at least 4 implementations of place and we want to - // special-case them as much as possible so that, at the least, the - // fastpath remains fast. 
- // - // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP - // - // - implement CLIPPED + NO TRANSLATION path - // - // - implement NO CLIP + TRANSLATION path - // - // - implement CLIPPED + TRANSLATION path - // - // - // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin - // 12:12:8 integer where: - // - // 12: ttsk - // 12: ttpk - // 8: /dev/null -- clipped or invalid key - // - // Three kinds of nodes in a raster's list: - // - // - the head node - // - an internal node - // - the final node - // - -#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const cmd_idx = get_group_id(0); -#else - skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // load command - union skc_cmd_place const cmd = cmds[cmd_idx]; - - // get the raster header from the raster host id -- scalar - skc_block_id_t id = map[cmd.raster_h]; - - // - // load all of the head block ttxk keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_raster_node_elem const h##I = { \ - .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \ - bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \ - }; - - SKC_PLACE_EXPAND(); - - // - // load raster header counts -- we only need the "nodes" and "keys" - // words but the keys we loaded are doublewords. - // - // FIXME -- this can be made portable with compile-time macro expansion - // - skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES - skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS - - // - // - // -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ - nodes,keys, \ - I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ - h##I.u32v2.hi,h##I.u32v2.lo, \ - h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); - - SKC_PLACE_EXPAND(); -#endif - - // -#if 0 - if (get_sub_group_local_id() == 0) { - printf("place: %u / %u / %u\n",head_id,nodes,keys); - } -#endif - - { - // - // classify every key in the header - // - // keys: 0 is not a key / 1 is a key - // skpk: 0 is sk / 1 is pk - // - skc_uint bits_keys = 0; - skc_uint bits_skpk = 0; - - // - // calculate bits_keys - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \ - if (idx < keys) { \ - bits_keys |= (1u << I); \ - } \ - if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ - if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \ - if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ - bits_keys &= ~(1u << I); \ - } \ - } \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // blindly calculate bits_skpk - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2X : %2X\n",bits_keys,bits_skpk); -#endif - - // - // next pointer is last element of last row. save it now because - // this might be recognized as a subgroup-uniform/scalar. 
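// (i.e. the next-node id sits in the last element of the last register row,
// so the broadcast below reads lane SKC_PLACE_SUBGROUP_LAST of row
// SKC_PLACE_EXPAND_I_LAST and makes 'id' subgroup-uniform before the
// trailing-node loop dereferences it)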
- // - id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); - - // - // append SK keys first - // - skc_uint const bits_sk = bits_keys & ~bits_skpk; - skc_uint sk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint is_sk = (bits_sk >> I) & 1; \ - skc_uint sk_idx = skc_ballot(&sk,is_sk); \ - if (is_sk) { \ - smem->lo.sk[sk_idx] = h##I.xk.lo; \ - smem->hi.sk[sk_idx] = h##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // append PK keys next - // - skc_uint const bits_pk = bits_keys & bits_skpk; - skc_uint pk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ - skc_uint is_pk = (bits_pk >> I) & 1; \ - skc_uint pk_idx = skc_ballot(&pk,is_pk); \ - if (is_pk) { \ - smem->lo.pk[pk_idx] = h##I.xk.lo; \ - smem->hi.pk[pk_idx] = h##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2u * %2u\n",sk,pk); -#endif - // - // flush the keys - // - skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); - skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); - } - - // - // we're done if there was only a head node - // - if (nodes == 0) - return; - - // - // decrement keys - // - keys -= SKC_RASTER_HEAD_COUNT_KEYS; - - // - // otherwise, append keys in trailing nodes to smem - // - while (true) - { - // - // load all of the node block ttxk keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - union skc_raster_node_elem const n##I = { \ - .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \ - bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \ - }; - - SKC_PLACE_EXPAND(); - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ - nodes,keys, \ - I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ - n##I.u32v2.hi,n##I.u32v2.lo, \ - n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); - - SKC_PLACE_EXPAND(); -#endif - - // - // classify every key in the header - // - // keys: 0 is not a key / 1 is a key - // skpk: 0 is sk / 1 is pk - // - skc_uint bits_keys = 0; - skc_uint bits_skpk = 0; - - // - // calculate bits_keys - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \ - if (idx < keys) { \ - bits_keys |= (1u << I); \ - } \ - if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ - if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \ - if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ - bits_keys &= ~(1u << I); \ - } \ - } \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // blindly calculate bits_skpk - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2X : %2X\n",bits_keys,bits_skpk); -#endif - - // - // next pointer is last element of last row. save it now because - // this might be recognized as a subgroup-uniform/scalar. 
- // - id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); - - // - // append SK keys first - // - skc_uint const bits_sk = bits_keys & ~bits_skpk; - skc_uint sk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint is_sk = (bits_sk >> I) & 1; \ - skc_uint sk_idx = skc_ballot(&sk,is_sk); \ - if (is_sk) { \ - smem->lo.sk[sk_idx] = n##I.xk.lo; \ - smem->hi.sk[sk_idx] = n##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - - // - // append PK keys next - // - skc_uint const bits_pk = bits_keys & bits_skpk; - skc_uint pk = 0; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint is_pk = (bits_pk >> I) & 1; \ - skc_uint pk_idx = skc_ballot(&pk,is_pk); \ - if (is_pk) { \ - smem->lo.pk[pk_idx] = n##I.xk.lo; \ - smem->hi.pk[pk_idx] = n##I.xk.hi; \ - } \ - } - - SKC_PLACE_EXPAND(); - -#if 0 - printf("%2u * %2u\n",sk,pk); -#endif - // - // if total for either the sk or pk queue reaches the - // highwater mark then flush it to the extent - // - skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); - skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); - - // - // if this was the last node then we're done - // - if (--nodes == 0) - return; - - // - // otherwise decrement keys - // - keys -= SKC_RASTER_NODE_COUNT_KEYS; - } -} - -// -// -// diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.c b/src/compute/skc/platforms/cl_12/allocator_device_cl.c new file mode 100644 index 0000000000..aa44f36e87 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.c @@ -0,0 +1,136 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "runtime_cl_12.h" +#include "config_cl.h" +#include "common/cl/assert_cl.h" + +// +// PERM +// + +cl_mem +skc_runtime_device_perm_alloc(struct skc_runtime * const runtime, + cl_mem_flags const flags, + size_t const size) +{ + cl_int cl_err; + + cl_mem mem = clCreateBuffer(runtime->cl.context, + flags, + size, + NULL, + &cl_err); cl_ok(cl_err); + return mem; +} + +void +skc_runtime_device_perm_free(struct skc_runtime * const runtime, + cl_mem const mem) +{ + cl(ReleaseMemObject(mem)); +} + +// +// TEMP +// + +cl_mem +skc_runtime_device_temp_alloc(struct skc_runtime * const runtime, + cl_mem_flags const flags, + size_t const size, + skc_subbuf_id_t * const subbuf_id, + size_t * const subbuf_size) +{ + if (size == 0) + { + *subbuf_id = (skc_subbuf_id_t)-1; + + if (subbuf_size != NULL) + *subbuf_size = 0; + + return NULL; + } + + cl_buffer_region br; + + br.origin = skc_suballocator_subbuf_alloc(&runtime->allocator.device.temp.suballocator, + runtime->scheduler, + size,subbuf_id,&br.size); + + if (subbuf_size != NULL) + *subbuf_size = br.size; + + cl_int cl_err; + + cl_mem mem = clCreateSubBuffer(runtime->allocator.device.temp.extent, + flags, + CL_BUFFER_CREATE_TYPE_REGION, + &br, + &cl_err); cl_ok(cl_err); + + return mem; +} + + +void +skc_runtime_device_temp_free(struct skc_runtime * const runtime, + cl_mem const mem, + skc_subbuf_id_t const subbuf_id) +{ + if (mem == NULL) + return; + + skc_suballocator_subbuf_free(&runtime->allocator.device.temp.suballocator,subbuf_id); + + cl(ReleaseMemObject(mem)); +} + +// +// +// + +void +skc_allocator_device_create(struct skc_runtime * const runtime) +{ + skc_suballocator_create(runtime, + &runtime->allocator.device.temp.suballocator, + "DEVICE", + runtime->config->suballocator.device.subbufs, + 
runtime->cl.base_align, + runtime->config->suballocator.device.size); + +#ifndef NDEBUG +#pragma message("Get rid of CL_MEM_ALLOC_HOST_PTR as soon as the sorter is installed") + cl_mem_flags const flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; +#else + cl_mem_flags const flags = CL_MEM_READ_WRITE; +#endif + + runtime->allocator.device.temp.extent = + skc_runtime_device_perm_alloc(runtime, + flags, + runtime->config->suballocator.device.size); +} + +void +skc_allocator_device_dispose(struct skc_runtime * const runtime) +{ + skc_suballocator_dispose(runtime,&runtime->allocator.device.temp.suballocator); + + skc_runtime_device_perm_free(runtime,runtime->allocator.device.temp.extent); +} + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.h b/src/compute/skc/platforms/cl_12/allocator_device_cl.h new file mode 100644 index 0000000000..67d4e41398 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.h @@ -0,0 +1,54 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +// +// +// + +#include "suballocator.h" + +// +// +// + +struct skc_allocator_device +{ +#if 0 + struct { + + } perm; +#endif + + struct { + struct skc_suballocator suballocator; + cl_mem extent; + } temp; +}; + +// +// +// + +void +skc_allocator_device_create(struct skc_runtime * const runtime); + +void +skc_allocator_device_dispose(struct skc_runtime * const runtime); + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/atomic_cl.h b/src/compute/skc/platforms/cl_12/atomic_cl.h new file mode 100644 index 0000000000..c196c36390 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/atomic_cl.h @@ -0,0 +1,72 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#ifndef SKC_ONCE_ATOMIC_CL +#define SKC_ONCE_ATOMIC_CL + +// +// git cl upload is bleating about needing an #include before and #if +// so we're unneccesarily reloading the types and OpenCL header +// + +#include "types.h" + +#if (__OPENCL_C_VERSION__ <= 120 /*CL_VERSION_1_2*/) + +#define SKC_ATOMIC_UINT uint +#define SKC_ATOMIC_INT int + +#define SKC_ATOMIC_ADD_LOCAL_RELAXED_DEVICE(p,v) atomic_add(p,v) +#define SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(p,v) atomic_add(p,v) + +#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(p,v) atomic_add(p,v) +#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(p,v) atomic_add(p,v) + +#else // __OPENCL_C_VERSION__ > __CL_VERSION_1_2 + +// +// REMOVE THESE DEFINES ASAP -- ONLY HERE BECAUSE THE INTEL CODE +// BUILDER UTILITY DOESN'T SUPPORT CREATING AN ATOMIC TYPE BUFFER +// + +#ifdef SKC_SUPPORT_BROKEN_INTEL_CODE_BUILDER + +#define SKC_ATOMIC_UINT uint +#define SKC_ATOMIC_CAST_LOCAL(p) (__local atomic_uint volatile * restrict const)(p) +#define SKC_ATOMIC_CAST_GLOBAL(p) (__global atomic_uint volatile * restrict const)(p) + +#else + +#define SKC_ATOMIC_UINT atomic_uint +#define SKC_ATOMIC_CAST_LOCAL(p) (p) +#define SKC_ATOMIC_CAST_GLOBAL(p) (p) + +#endif + + +#define SKC_ATOMIC_ADD_LOCAL_RELAXED_DEVICE(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_LOCAL(p), \ + v,memory_order_relaxed,memory_scope_device) +#define SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_LOCAL(p), \ + v,memory_order_relaxed,memory_scope_sub_group) + +#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_GLOBAL(p), \ + v,memory_order_relaxed,memory_scope_device) +#define SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(p,v) atomic_fetch_add_explicit(SKC_ATOMIC_CAST_GLOBAL(p), \ + v,memory_order_relaxed,memory_scope_sub_group) + +#endif + +// +// +// + +#endif // SKC_ONCE_ATOMIC_CL + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/block_pool_cl.h b/src/compute/skc/platforms/cl_12/block_pool_cl.h new file mode 100644 index 0000000000..c88370919e --- /dev/null +++ b/src/compute/skc/platforms/cl_12/block_pool_cl.h @@ -0,0 +1,60 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_ONCE_BLOCK_POOL +#define SKC_ONCE_BLOCK_POOL + +// +// +// + +#include "types.h" + +// +// +// + +union skc_block_pool_size +{ + skc_uint3 u32v3; + + struct { + skc_uint pool_size; // number of blocks + skc_uint ring_pow2; // rounded-up pow2 of pool_size + skc_uint ring_mask; // ring_pow2 - 1 + }; +}; + +// +// +// + +union skc_block_pool_atomic +{ + skc_uint2 u32v2; + + skc_uint u32a2[2]; + + struct { + skc_uint reads; + skc_uint writes; + }; +}; + +#define SKC_BP_ATOMIC_OFFSET_READS 0 +#define SKC_BP_ATOMIC_OFFSET_WRITES 1 + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/block_pool_cl_12.h b/src/compute/skc/platforms/cl_12/block_pool_cl_12.h new file mode 100644 index 0000000000..6fa8a39ca0 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/block_pool_cl_12.h @@ -0,0 +1,33 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#pragma once + +// +// +// + +#include "block_pool_cl.h" +#include "extent_cl_12.h" + +// +// device side block pool +// + +struct skc_block_pool +{ + union skc_block_pool_size const * size; + + struct skc_extent_pdrw blocks; + struct skc_extent_pdrw ids; + struct skc_extent_phr_pdrw atomics; +}; + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/composition_cl_12.c b/src/compute/skc/platforms/cl_12/composition_cl_12.c new file mode 100644 index 0000000000..7853564636 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/composition_cl_12.c @@ -0,0 +1,823 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include + +#include "hs/cl/hs_cl_launcher.h" + +#include "common/cl/assert_cl.h" + +#include "composition_cl_12.h" +#include "config_cl.h" + +#include "context.h" +#include "raster.h" +#include "handle.h" + +#include "runtime_cl_12.h" + +#include "common.h" +#include "tile.h" + +// +// TTCK (32-BIT COMPARE) v1: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 18 | 7 | 7 | +// +// +// TTCK (32-BIT COMPARE) v2: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 15 | 9 | 8 | +// +// +// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 27 | 1 | 1 | 18 | 9 | 8 | +// + +union skc_ttck +{ + skc_ulong u64; + skc_uint2 u32v2; + + struct { + skc_uint id : SKC_TTCK_LO_BITS_ID; + skc_uint prefix : SKC_TTCK_LO_BITS_PREFIX; + skc_uint escape : SKC_TTCK_LO_BITS_ESCAPE; + skc_uint layer_lo : SKC_TTCK_LO_BITS_LAYER; + skc_uint layer_hi : SKC_TTCK_HI_BITS_LAYER; + skc_uint x : SKC_TTCK_HI_BITS_X; + skc_uint y : SKC_TTCK_HI_BITS_Y; + }; + + struct { + skc_ulong na0 : SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE; + skc_ulong layer : SKC_TTCK_BITS_LAYER; + skc_ulong na1 : SKC_TTCK_HI_BITS_YX; + }; + + struct { + skc_uint na2; + skc_uint na3 : SKC_TTCK_HI_BITS_LAYER; + skc_uint yx : SKC_TTCK_HI_BITS_YX; + }; +}; + +// +// FIXME -- accept floats on host but convert to subpixel offsets +// before appending to command ring +// + +#define SKC_PLACE_CMD_TX_CONVERT(f) 0 +#define SKC_PLACE_CMD_TY_CONVERT(f) 0 + +// +// COMPOSITION PLACE +// +// This is a snapshot of the host-side command queue. +// +// Note that the composition command extent could be implemented as +// either a mapped buffer or simply copied to an ephemeral extent. +// +// This implementation may vary between compute platforms. 
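// For reference, a hedged sketch of pulling fields out of a TTCK with the
// skc_ttck union above -- 'key' is a hypothetical input value and the field
// widths are whatever the SKC_TTCK_* macros select:
//
#if 0
  union skc_ttck const ck = { .u64 = key };

  skc_uint  const id    = ck.id;     // payload/TTSB/TTPB id              (SKC_TTCK_LO_BITS_ID)
  skc_ulong const layer = ck.layer;  // layer, straddles 32-bit boundary  (SKC_TTCK_BITS_LAYER)
  skc_uint  const x     = ck.x;      // tile x                            (SKC_TTCK_HI_BITS_X)
  skc_uint  const y     = ck.y;      // tile y                            (SKC_TTCK_HI_BITS_Y)
#endif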
+// + +struct skc_composition_place +{ + struct skc_composition_impl * impl; + + cl_command_queue cq; + + struct skc_extent_phw1g_tdrNs_snap cmds; + + skc_subbuf_id_t id; +}; + +// +// Forward declarations +// + +static +void +skc_composition_unseal_block(struct skc_composition_impl * const impl, + skc_bool const block); + +// +// +// + +static +void +skc_composition_pfn_release(struct skc_composition_impl * const impl) +{ + if (--impl->composition->ref_count != 0) + return; + + // + // otherwise, dispose of all resources + // + + // the unsealed state is a safe state to dispose of resources + skc_composition_unseal_block(impl,true); // block + + struct skc_runtime * const runtime = impl->runtime; + + // free host composition + skc_runtime_host_perm_free(runtime,impl->composition); + + // release the cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // release kernels + cl(ReleaseKernel(impl->kernels.place)); + cl(ReleaseKernel(impl->kernels.segment)); + + // release extents + skc_extent_phw1g_tdrNs_free(runtime,&impl->cmds.extent); + skc_extent_phrw_free (runtime,&impl->saved.extent); + skc_extent_phr_pdrw_free (runtime,&impl->atomics); + + skc_extent_pdrw_free (runtime,&impl->keys); + skc_extent_pdrw_free (runtime,&impl->offsets); + + // free composition impl + skc_runtime_host_perm_free(runtime,impl); +} + +// +// +// + +static +void +skc_composition_place_grid_pfn_dispose(skc_grid_t const grid) +{ + struct skc_composition_place * const place = skc_grid_get_data(grid); + struct skc_composition_impl * const impl = place->impl; + struct skc_runtime * const runtime = impl->runtime; + + // release cq + skc_runtime_release_cq_in_order(runtime,place->cq); + + // unmap the snapshot (could be a copy) + skc_extent_phw1g_tdrNs_snap_free(runtime,&place->cmds); + + // release place struct + skc_runtime_host_temp_free(runtime,place,place->id); + + // release impl + skc_composition_pfn_release(impl); +} + +// +// +// + +static +void +skc_composition_place_read_complete(skc_grid_t const grid) +{ + skc_grid_complete(grid); +} + +static +void +skc_composition_place_read_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_composition_place * const place = skc_grid_get_data(grid); + struct skc_composition_impl * const impl = place->impl; + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(scheduler,skc_composition_place_read_complete,grid); +} + +static +void +skc_composition_place_grid_pfn_execute(skc_grid_t const grid) +{ + // + // FILLS EXPAND + // + // need result of cmd counts before launching RASTERIZE grids + // + // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host + // - OpenCL 2.x: have a kernel size and launch RASTERIZE grids from device + // - or launch a device-wide grid that feeds itself but that's unsatisfying + // + struct skc_composition_place * const place = skc_grid_get_data(grid); + struct skc_composition_impl * const impl = place->impl; + struct skc_runtime * const runtime = impl->runtime; + + skc_uint const work_size = skc_extent_ring_snap_count(place->cmds.snap); + skc_uint4 const clip = { 0, 0, SKC_UINT_MAX, SKC_UINT_MAX }; + + // initialize kernel args + cl(SetKernelArg(impl->kernels.place,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.place,1,SKC_CL_ARG(impl->atomics.drw))); + 
cl(SetKernelArg(impl->kernels.place,2,SKC_CL_ARG(impl->keys.drw))); + cl(SetKernelArg(impl->kernels.place,3,SKC_CL_ARG(place->cmds.drN))); + cl(SetKernelArg(impl->kernels.place,4,SKC_CL_ARG(runtime->handle_pool.map.drw))); + cl(SetKernelArg(impl->kernels.place,5,SKC_CL_ARG(clip))); // FIXME -- convert the clip to yx0/yx1 format + cl(SetKernelArg(impl->kernels.place,6,SKC_CL_ARG(work_size))); + + // launch kernel + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_PLACE, + place->cq, + impl->kernels.place, + work_size, + 0,NULL,NULL); + // + // copy atomics back after every place launch + // + cl_event complete; + + skc_extent_phr_pdrw_read(&impl->atomics,place->cq,&complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_composition_place_read_cb,grid)); + cl(ReleaseEvent(complete)); + + // flush command queue + cl(Flush(place->cq)); +} + +// +// +// + +static +void +skc_composition_snap(struct skc_composition_impl * const impl) +{ + skc_composition_retain(impl->composition); + + skc_subbuf_id_t id; + + struct skc_composition_place * const place = skc_runtime_host_temp_alloc(impl->runtime, + SKC_MEM_FLAGS_READ_WRITE, + sizeof(*place),&id,NULL); + + // save the subbuf id + place->id = id; + + // save backpointer + place->impl = impl; + + // set grid data + skc_grid_set_data(impl->grids.place,place); + + // acquire command queue + place->cq = skc_runtime_acquire_cq_in_order(impl->runtime); + + // checkpoint the ring + skc_extent_ring_checkpoint(&impl->cmds.ring); + + // make a snapshot + skc_extent_phw1g_tdrNs_snap_init(impl->runtime,&impl->cmds.ring,&place->cmds); + + // unmap the snapshot (could be a copy) + skc_extent_phw1g_tdrNs_snap_alloc(impl->runtime, + &impl->cmds.extent, + &place->cmds, + place->cq, + NULL); + + skc_grid_force(impl->grids.place); +} + +// +// +// + +static +void +skc_composition_pfn_seal(struct skc_composition_impl * const impl) +{ + // return if sealing or sealed + if (impl->state >= SKC_COMPOSITION_STATE_SEALING) + return; + + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // + // otherwise, wait for UNSEALING > UNSEALED transition + // + if (impl->state == SKC_COMPOSITION_STATE_UNSEALING) + { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_UNSEALED); + } + else // or we were already unsealed + { + // flush is there is work in progress + skc_uint const count = skc_extent_ring_wip_count(&impl->cmds.ring); + + if (count > 0) { + skc_composition_snap(impl); + } + } + + // + // now unsealed so we need to start sealing... 
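// For reference, the state transitions driven by seal()/unseal() in this
// file (UNSEALING is declared but never entered by this implementation):
//
//   UNSEALED --- seal() ------------------> SEALING
//   SEALING  --- sort grid completes -----> SEALED
//   SEALED   --- unseal() ----------------> UNSEALED  (reset optionally zeroes the
//                                                      atomics and releases saved rasters)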
+ // + impl->state = SKC_COMPOSITION_STATE_SEALING; + + // + // the seal operation implies we should force start all dependencies + // that are still in a ready state + // + skc_grid_force(impl->grids.sort); +} + +// +// +// + +void +skc_composition_sort_execute_complete(struct skc_composition_impl * const impl) +{ + // we're sealed + impl->state = SKC_COMPOSITION_STATE_SEALED; + + // this grid is done + skc_grid_complete(impl->grids.sort); +} + +static +void +skc_composition_sort_execute_cb(cl_event event, cl_int status, struct skc_composition_impl * const impl) +{ + SKC_CL_CB(status); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(impl->runtime->scheduler,skc_composition_sort_execute_complete,impl); +} + +static +void +skc_composition_sort_grid_pfn_execute(skc_grid_t const grid) +{ + struct skc_composition_impl * const impl = skc_grid_get_data(grid); + + // we should be sealing + assert(impl->state == SKC_COMPOSITION_STATE_SEALING); + + struct skc_place_atomics * const atomics = impl->atomics.hr; + +#ifndef NDEBUG + fprintf(stderr,"composition sort: %u\n",atomics->keys); +#endif + + if (atomics->keys > 0) + { + uint32_t keys_padded_in, keys_padded_out; + + hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); + + hs_sort(impl->cq, + impl->keys.drw, + impl->keys.drw, + atomics->keys, + keys_padded_in, + keys_padded_out, + false); + + cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(impl->keys.drw))); + cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(impl->offsets.drw))); + cl(SetKernelArg(impl->kernels.segment,2,SKC_CL_ARG(impl->atomics.drw))); + + // find start of each tile + skc_device_enqueue_kernel(impl->runtime->device, + SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, + impl->cq, + impl->kernels.segment, + atomics->keys, + 0,NULL,NULL); + } + + cl_event complete; + + // next stage needs to know number of key segments + skc_extent_phr_pdrw_read(&impl->atomics,impl->cq,&complete); + + // register a callback + cl(SetEventCallback(complete,CL_COMPLETE,skc_composition_sort_execute_cb,impl)); + cl(ReleaseEvent(complete)); + + // flush cq + cl(Flush(impl->cq)); +} + +// +// +// + +static +void +skc_composition_raster_release(struct skc_composition_impl * const impl) +{ + // + // reference counts to rasters can only be released when the + // composition is unsealed and the atomics are reset. + // + skc_runtime_raster_device_release(impl->runtime, + impl->saved.extent.hrw, + impl->saved.count); + // reset count + impl->saved.count = 0; +} + +// +// +// + +static +void +skc_composition_unseal_block(struct skc_composition_impl * const impl, + skc_bool const block) +{ + // return if already unsealed + if (impl->state == SKC_COMPOSITION_STATE_UNSEALED) + return; + + // + // otherwise, we're going to need to pump the scheduler + // + struct skc_scheduler * const scheduler = impl->runtime->scheduler; + + // + // wait for UNSEALING > UNSEALED transition + // + if (impl->state == SKC_COMPOSITION_STATE_UNSEALING) + { + if (block) { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_UNSEALED); + } + return; + } + + // + // wait for SEALING > SEALED transition ... 
+ // + if (impl->state == SKC_COMPOSITION_STATE_SEALING) + { + // wait if sealing + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_COMPOSITION_STATE_SEALED); + } + + // wait for rendering locks to be released + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0); + + // + // no need to visit UNSEALING state with this implementation + // + + // acquire a new grid + impl->grids.sort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + NULL, // the composition state guards this + impl, + NULL, // no waiting + skc_composition_sort_grid_pfn_execute, + NULL); // no dispose + + // mark composition as unsealed + impl->state = SKC_COMPOSITION_STATE_UNSEALED; +} + +// +// can only be called on a composition that was just unsealed +// +static +void +skc_composition_reset(struct skc_composition_impl * const impl) +{ + // zero the atomics + skc_extent_phr_pdrw_zero(&impl->atomics,impl->cq,NULL); + + // flush it + cl(Flush(impl->cq)); + + // release all the rasters + skc_composition_raster_release(impl); +} + +static +void +skc_composition_unseal_block_reset(struct skc_composition_impl * const impl, + skc_bool const block, + skc_bool const reset) +{ + skc_composition_unseal_block(impl,block); + + if (reset) { + skc_composition_reset(impl); + } +} + +// +// +// + +static +void +skc_composition_pfn_unseal(struct skc_composition_impl * const impl, skc_bool const reset) +{ + skc_composition_unseal_block_reset(impl,false,reset); +} + +// +// only needs to create a grid +// + +static +void +skc_composition_place_create(struct skc_composition_impl * const impl) +{ + // acquire a grid + impl->grids.place = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + &impl->grids.place, + NULL, + NULL, // no waiting + skc_composition_place_grid_pfn_execute, + skc_composition_place_grid_pfn_dispose); + + // assign happens-after relationship + skc_grid_happens_after_grid(impl->grids.sort,impl->grids.place); +} + + +static +skc_err +skc_composition_pfn_place(struct skc_composition_impl * const impl, + skc_raster_t const * rasters, + skc_layer_id const * layer_ids, + skc_float const * txs, + skc_float const * tys, + skc_uint count) +{ + // block and yield if not unsealed + skc_composition_unseal_block(impl,true); + + // + // validate and retain all rasters + // + skc_err err; + + err = skc_runtime_handle_device_validate_retain(impl->runtime, + SKC_TYPED_HANDLE_TYPE_IS_RASTER, + rasters, + count); + if (err) + return err; + + skc_runtime_handle_device_retain(impl->runtime,rasters,count); + + // + // save the stripped handles + // + skc_raster_t * saved = impl->saved.extent.hrw; + + saved += impl->saved.count; + impl->saved.count += count; + + for (skc_uint ii=0; iiruntime->scheduler,(rem = skc_extent_ring_wip_rem(&impl->cmds.ring)) == 0); + + // append commands + skc_uint avail = min(rem,count); + + // decrement count + count -= avail; + + // launch a place kernel after copying commands? + skc_bool const is_wip_full = (avail == rem); + + // if there is no place grid then create one + if (impl->grids.place == NULL) + { + skc_composition_place_create(impl); + } + + // + // FIXME -- OPTIMIZATION? -- the ring_wip_index_inc() test can + // be avoided by splitting into at most two intervals. It should + // be plenty fast as is though so leave for now. 
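// A hedged sketch of the two-interval split suggested above -- 'head' is an
// illustrative stand-in for the ring's current wip write offset (already
// wrapped into [0,elem_count)) and only the index arithmetic is shown:
//
#if 0
  skc_uint const first  = min(avail,elem_count - head);  // contiguous run up to the wrap
  skc_uint const second = avail - first;                 // wrapped remainder -- may be zero

  // fill cmds[head..head+first-1] and then cmds[0..second-1] without testing
  // for wrap-around on every index increment
#endif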
+ // + union skc_cmd_place * const cmds = impl->cmds.extent.hw1; + + if ((txs == NULL) && (tys == NULL)) + { + while (avail-- > 0) + { + skc_raster_t const raster = *saved++; + + skc_grid_happens_after_handle(impl->grids.place,raster); + + cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = + (union skc_cmd_place){ raster, *layer_ids++, 0, 0 }; + } + } + else if (txs == NULL) + { + while (avail-- > 0) + { + skc_raster_t const raster = *saved++; + + skc_grid_happens_after_handle(impl->grids.place,raster); + + cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = + (union skc_cmd_place){ raster, + *layer_ids++, + 0, + SKC_PLACE_CMD_TY_CONVERT(*tys++) }; + } + } + else if (tys == NULL) + { + while (avail-- > 0) + { + skc_raster_t const raster = *saved++; + + skc_grid_happens_after_handle(impl->grids.place,raster); + + cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = + (union skc_cmd_place){ raster, + *layer_ids++, + SKC_PLACE_CMD_TX_CONVERT(*txs++), + 0 }; + } + } + else + { + while (avail-- > 0) + { + skc_raster_t const raster = *saved++; + + skc_grid_happens_after_handle(impl->grids.place,raster); + + cmds[skc_extent_ring_wip_index_inc(&impl->cmds.ring)] = + (union skc_cmd_place){ raster, + *layer_ids++, + SKC_PLACE_CMD_TX_CONVERT(*txs++), + SKC_PLACE_CMD_TY_CONVERT(*tys++) }; + } + } + + // launch place kernel? + if (is_wip_full) { + skc_composition_snap(impl); + } + } while (count > 0); + + return SKC_ERR_SUCCESS; +} + +// +// +// + +static +void +skc_composition_pfn_bounds(struct skc_composition_impl * const impl, skc_int bounds[4]) +{ + // + // FIXME -- not implemented yet + // + // impl bounds will be copied back after sealing + // + bounds[0] = SKC_INT_MIN; + bounds[1] = SKC_INT_MIN; + bounds[2] = SKC_INT_MAX; + bounds[3] = SKC_INT_MAX; +} + +// +// +// + +void +skc_composition_retain_and_lock(struct skc_composition * const composition) +{ + skc_composition_retain(composition); + + composition->impl->lock_count += 1; +} + +void +skc_composition_unlock_and_release(struct skc_composition * const composition) +{ + composition->impl->lock_count -= 1; + + skc_composition_pfn_release(composition->impl); +} + +// +// +// + +skc_err +skc_composition_cl_12_create(struct skc_context * const context, + struct skc_composition * * const composition) +{ + struct skc_runtime * const runtime = context->runtime; + + // retain the context + // skc_context_retain(context); + + // allocate impl + struct skc_composition_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + // allocate composition + (*composition) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**composition)); + + (*composition)->context = context; + (*composition)->impl = impl; + (*composition)->ref_count = 1; + + (*composition)->place = skc_composition_pfn_place; + (*composition)->unseal = skc_composition_pfn_unseal; + (*composition)->seal = skc_composition_pfn_seal; + (*composition)->bounds = skc_composition_pfn_bounds; + (*composition)->release = skc_composition_pfn_release; + + // intialize impl + impl->composition = (*composition); + impl->runtime = runtime; + + SKC_ASSERT_STATE_INIT(impl,SKC_COMPOSITION_STATE_SEALED); + + impl->lock_count = 0; + + impl->grids.sort = NULL; + impl->grids.place = NULL; + + // acquire command queue for sealing/unsealing + impl->cq = skc_runtime_acquire_cq_in_order(runtime); + + // acquire kernels + impl->kernels.place = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_PLACE); + impl->kernels.segment = 
skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK); + + // get config + struct skc_config const * const config = runtime->config; + + // initialize ring size with config values + skc_extent_ring_init(&impl->cmds.ring, + config->composition.cmds.elem_count, + config->composition.cmds.snap_count, + sizeof(union skc_cmd_place)); + + skc_extent_phw1g_tdrNs_alloc(runtime,&impl->cmds.extent ,sizeof(union skc_cmd_place) * config->composition.cmds.elem_count); + skc_extent_phrw_alloc (runtime,&impl->saved.extent,sizeof(skc_raster_t) * config->composition.raster_ids.elem_count); + skc_extent_phr_pdrw_alloc (runtime,&impl->atomics ,sizeof(struct skc_place_atomics)); + + skc_extent_pdrw_alloc (runtime,&impl->keys ,sizeof(skc_ttxk_t) * config->composition.keys.elem_count); + skc_extent_pdrw_alloc (runtime,&impl->offsets ,sizeof(skc_uint) * (1u << SKC_TTCK_HI_BITS_YX)); // 1MB + + // nothing saved + impl->saved.count = 0; + + // unseal the composition, zero the atomics, etc. + skc_composition_unseal_block_reset(impl,false,true); + + return SKC_ERR_SUCCESS; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/composition_cl_12.h b/src/compute/skc/platforms/cl_12/composition_cl_12.h new file mode 100644 index 0000000000..4f52090658 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/composition_cl_12.h @@ -0,0 +1,105 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +#include "composition.h" +#include "assert_state.h" +#include "grid.h" +#include "extent_cl_12.h" +#include "extent_ring.h" + +// +// composition states +// + +typedef enum skc_composition_state_e { + + SKC_COMPOSITION_STATE_UNSEALING, + SKC_COMPOSITION_STATE_UNSEALED, + SKC_COMPOSITION_STATE_SEALING, + SKC_COMPOSITION_STATE_SEALED + +} skc_composition_state_e; + +// +// IMPL +// + +struct skc_composition_impl +{ + struct skc_composition * composition; + struct skc_runtime * runtime; + + SKC_ASSERT_STATE_DECLARE(skc_composition_state_e); + + skc_int lock_count; // wip renders + + struct { + skc_grid_t sort; + skc_grid_t place; + } grids; + + cl_command_queue cq; + + struct { + cl_kernel place; + cl_kernel segment; + } kernels; + + // raster ids must be held until the composition is reset or + // released and then their refcounts can be decremented + struct { + struct skc_extent_phrw extent; + skc_uint count; + } saved; + + struct { + struct skc_extent_ring ring; // how many slots left? + struct skc_extent_phw1g_tdrNs extent; // wip command extent + } cmds; + + // composition extent length + struct skc_extent_phr_pdrw atomics; + + // composition ttck extent + struct skc_extent_pdrw keys; + + // key offsets in sealed and sorted ttck extent + struct skc_extent_pdrw offsets; +}; + +// +// ATOMICS +// + +struct skc_place_atomics +{ + skc_uint keys; + skc_uint offsets; +}; + +// +// ONLY VISIBLE WITHIN THIS RUNTIME +// + +void +skc_composition_retain_and_lock(struct skc_composition * const composition); + +void +skc_composition_unlock_and_release(struct skc_composition * const composition); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/config_cl.h b/src/compute/skc/platforms/cl_12/config_cl.h new file mode 100644 index 0000000000..0172857b07 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/config_cl.h @@ -0,0 +1,147 @@ +/* + * Copyright 2017 Google Inc. 
+ * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "runtime_cl.h" +#include "block_pool_cl.h" + +// +// FIXME -- define individual structs before defining skc_config +// + +struct skc_config +{ + struct { + struct { + skc_uint size; + skc_uint subbufs; + } host; // alignment determined by compiler + struct { + skc_uint size; + skc_uint subbufs; + } device; // alignment determined by device + } suballocator; + + struct { + skc_uint size; + } scheduler; + + struct { + skc_uint bytes; // bytes per subblock -- pow2 + skc_uint words; // words per subblock -- pow2 + // skc_uint words_log2; + } subblock; + + struct { + skc_uint bytes; // bytes per block -- pow2 + skc_uint words; // words per block -- pow2 + skc_uint subblocks; // subblocks per block -- block.bytes >= subblock.bytes + // skc_uint subblocks_log2; + } block; + + union skc_block_pool_size block_pool; + + struct { + skc_cq_type_e type; + skc_uint size; + } cq_pool; + + struct { + skc_uint size; // a large fraction of block pool size + skc_uint width; // determines number of launched reclamation subgroups + skc_uint recs; // how many in-flight width-subgroup reclamation grids + } handle_pool; + + struct { + skc_uint width; // tile width in pixels + skc_uint height; // tile height in pixels + skc_uint ratio; // subblocks per TTPB + } tile; + + struct { + struct { + skc_uint count; // # of subbufs in buffer + } buffer; + + struct { + skc_uint count; // # of blocks/commands in subbuf + } subbuf; + + struct { + size_t buffer; // block.bytes * subbuf.blocks * subbuf.count + size_t subbuf; // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN + } block; + + struct { + size_t buffer; // sizeof(skc_uint) * subbuf.blocks * subbuf.count + size_t subbuf; // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN + } command; + // + // skc_uint paths_lowat; + // + } paths_copy; + + struct { + struct { + skc_uint elem_count; + skc_uint snap_count; + } path_ids; + + struct { + skc_uint elem_count; + skc_uint snap_count; + } transforms; + + struct { + skc_uint elem_count; + skc_uint snap_count; + } clips; + + struct { + skc_uint elem_count; + skc_uint snap_count; + } fill; + + struct { + skc_uint elem_count; + skc_uint snap_count; + } raster_ids; + + struct { + skc_uint cmds; + } expand; + + struct { + skc_uint keys; + } rasterize; + } raster_cohort; + + struct { + struct { + skc_uint elem_count; + skc_uint snap_count; + } cmds; + + struct { + skc_uint elem_count; + } raster_ids; + + struct { + skc_uint elem_count; + } keys; + } composition; +}; + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/cq_pool_cl.c b/src/compute/skc/platforms/cl_12/cq_pool_cl.c new file mode 100644 index 0000000000..80cfe34cf8 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.c @@ -0,0 +1,152 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#ifndef NDEBUG +#include +#endif + +// +// +// + +#include + +// +// +// + +#include "runtime_cl_12.h" + +// +// This implementation is probably excessive. +// +// The command queue pool could easily be replaced with simply an LRU +// or even round-robin reuse pool. Even a small number of aliased +// command queues can probably enough concurrency. 
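//
// The round-robin alternative mentioned above could be as small as the
// sketch below. This is illustrative only -- it is not the
// implementation that follows -- and it assumes a fixed queue count
// plus direct use of the (deprecated-in-2.0) CL 1.2 entry point.
//

#include <CL/opencl.h>

#define SKC_CQ_RR_COUNT 8 // illustrative pool size

struct skc_cq_rr_pool
{
  cl_command_queue cq[SKC_CQ_RR_COUNT];
  unsigned         next;
};

static void
skc_cq_rr_pool_create(struct skc_cq_rr_pool * const pool,
                      cl_context              const context,
                      cl_device_id            const device_id)
{
  for (unsigned ii=0; ii<SKC_CQ_RR_COUNT; ii++)
    pool->cq[ii] = clCreateCommandQueue(context,device_id,0,NULL);

  pool->next = 0;
}

static cl_command_queue
skc_cq_rr_pool_acquire(struct skc_cq_rr_pool * const pool)
{
  // queues are aliased round-robin and never explicitly released
  return pool->cq[pool->next++ % SKC_CQ_RR_COUNT];
}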
+//

+#define SKC_CQ_POOL_EXPAND 1
+
+//
+//
+//
+
+void
+skc_cq_pool_create(struct skc_runtime * const runtime,
+                   struct skc_cq_pool * const pool,
+                   skc_uint             const type,
+                   skc_uint             const size)
+{
+  pool->type   = type;
+  pool->size   = size + 1; // an empty spot
+  pool->reads  = 0;
+  pool->writes = size;
+  pool->cq     = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq));
+
+  for (skc_uint ii=0; ii<size; ii++) {
+    pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
+  }
+  pool->cq[size] = NULL;
+}
+
+//
+//
+//
+
+void
+skc_cq_pool_dispose(struct skc_runtime * const runtime,
+                    struct skc_cq_pool *       pool)
+{
+  //
+  // FIXME -- release the command queues after waiting for the ring to
+  // be full with pool.size queues?
+  //
+  skc_runtime_host_perm_free(runtime,pool->cq);
+}
+
+//
+//
+//
+
+static
+void
+skc_cq_pool_write(struct skc_cq_pool * const pool,
+                  cl_command_queue           cq)
+{
+  pool->cq[pool->writes++ % pool->size] = cq;
+}
+
+//
+// only expand when completely empty
+//
+
+static
+void
+skc_cq_pool_expand(struct skc_runtime * const runtime,
+                   struct skc_cq_pool * const pool,
+                   skc_uint                   expand)
+{
+#ifndef NDEBUG
+  fprintf(stderr,"Expanding the cq_pool by: %u (%u)\n",expand,pool->size);
+#endif
+
+  // free old
+  skc_runtime_host_perm_free(runtime,pool->cq);
+
+  // the ring is empty
+  pool->size  += expand;
+  pool->cq     = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq));
+  pool->reads  = 0;
+  pool->writes = expand;
+
+  for (skc_uint ii=0; ii<expand; ii++)
+    pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
+}
+
+//
+//
+//
+
+static
+cl_command_queue
+skc_cq_pool_read(struct skc_runtime * const runtime,
+                 struct skc_cq_pool * const pool)
+{
+  // any command queues left?
+  if (pool->reads == pool->writes)
+    skc_cq_pool_expand(runtime,pool,SKC_CQ_POOL_EXPAND);
+
+  cl_command_queue cq = pool->cq[pool->reads++ % pool->size];
+
+  return cq;
+}
+
+//
+//
+//
+
+cl_command_queue
+skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime)
+{
+  return skc_cq_pool_read(runtime,&runtime->cq_pool);
+}
+
+void
+skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
+                                cl_command_queue           cq)
+{
+  skc_cq_pool_write(&runtime->cq_pool,cq);
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/cq_pool_cl.h b/src/compute/skc/platforms/cl_12/cq_pool_cl.h
new file mode 100644
index 0000000000..0cc73a2f82
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+#include "types.h"
+
+//
+// Why we need to wrap command queue creation:
+//
+//   - command queue creation is expensive
+//
+//   - the CL 1.2 function is deprecated in 2.0
+//
+
+struct skc_cq_pool
+{
+  skc_cq_type_e      type;
+  skc_uint           size;
+  skc_uint           reads;
+  skc_uint           writes;
+  cl_command_queue * cq;
+};
+
+//
+//
+//
+
+void
+skc_cq_pool_create(struct skc_runtime * const runtime,
+                   struct skc_cq_pool * const pool,
+                   skc_uint             const type,
+                   skc_uint             const size);
+
+void
+skc_cq_pool_dispose(struct skc_runtime * const runtime,
+                    struct skc_cq_pool *       pool);
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/device_cl_12.h b/src/compute/skc/platforms/cl_12/device_cl_12.h
new file mode 100644
index 0000000000..637b61ae10
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/device_cl_12.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2017 Google Inc.
+ * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +// +// +// + +#define SKC_CL_ARG(arg) sizeof(arg),&arg + +// +// +// + +typedef enum skc_device_kernel_id { + SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS, + SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS, + + SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, + SKC_DEVICE_KERNEL_ID_PATHS_COPY, + + SKC_DEVICE_KERNEL_ID_FILLS_EXPAND, + + SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL, + SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES, + SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS, + SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS, + SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_QUADS, + SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_CUBICS, + + SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK, + SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC, + + SKC_DEVICE_KERNEL_ID_PREFIX, + SKC_DEVICE_KERNEL_ID_PLACE, + SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, + + SKC_DEVICE_KERNEL_ID_RENDER, + + SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM, + SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM, + + // + SKC_DEVICE_KERNEL_ID_COUNT + +} skc_device_kernel_id; + +// +// +// + +void +skc_device_create(struct skc_runtime * const runtime); + + +void +skc_device_dispose(struct skc_runtime * const runtime); + + +// +// multi-threading/context/device requires multiple kernel instances +// + +cl_kernel +skc_device_acquire_kernel(struct skc_device * const device, + skc_device_kernel_id const type); + +// +// grid shape can vary greatly by target platform +// +void +skc_device_enqueue_kernel(struct skc_device * const device, + skc_device_kernel_id const type, + cl_command_queue cq, + cl_kernel kernel, + size_t const work_size, + cl_uint num_events_in_wait_list, + cl_event const * const event_wait_list, + cl_event * const event); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/export_cl_12.h b/src/compute/skc/platforms/cl_12/export_cl_12.h new file mode 100644 index 0000000000..e577282791 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/export_cl_12.h @@ -0,0 +1,63 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "skc.h" + +// +// +// + +skc_err +skc_path_builder_cl_12_create(struct skc_context * const context, + struct skc_path_builder * * const path_builder); + +// +// +// + +skc_err +skc_raster_builder_cl_12_create(struct skc_context * const context, + struct skc_raster_builder * * const raster_builder); + +// +// +// + +skc_err +skc_composition_cl_12_create(struct skc_context * const context, + struct skc_composition * * const composition); + +// +// +// + +skc_err +skc_styling_cl_12_create(struct skc_context * const context, + struct skc_styling * * const styling, + uint32_t const layers_count, + uint32_t const groups_count, + uint32_t const extras_count); + +// +// +// + +skc_err +skc_surface_cl_12_create(struct skc_context * const context, + struct skc_surface * * const surface); + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/extent_cl_12.c b/src/compute/skc/platforms/cl_12/extent_cl_12.c new file mode 100644 index 0000000000..73676d8063 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/extent_cl_12.c @@ -0,0 +1,459 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include + +#include "common/cl/assert_cl.h" +#include "extent_cl_12.h" +#include "runtime_cl_12.h" + +// +// DURABLE R/W HOST EXTENT -- STANDARD CACHED MEMORY +// + +void +skc_extent_phrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrw * const extent, + size_t const size) +{ + extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); +} + +void +skc_extent_phrw_free(struct skc_runtime * const runtime, + struct skc_extent_phrw * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hrw); +} + +// +// DURABLE R/W DEVICE EXTENT -- ALLOCATED FROM DEVICE HEAP +// + +void +skc_extent_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_pdrw * const extent, + size_t const size) +{ + extent->drw = skc_runtime_device_perm_alloc(runtime, + CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, + size); +} + +void +skc_extent_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_pdrw * const extent) +{ + skc_runtime_device_perm_free(runtime,extent->drw); +} + +// +// EPHEMERAL DEVICE R/W EXTENT -- ALLOCATED QUICKLY FROM A MANAGED RING +// + +void +skc_extent_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_tdrw * const extent, + size_t const size) +{ + extent->size = size; + extent->drw = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, + size,&extent->id,NULL); +} + +void +skc_extent_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_tdrw * const extent) +{ + skc_runtime_device_temp_free(runtime,extent->drw,extent->id); +} + +void +skc_extent_tdrw_zero(struct skc_extent_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + skc_uint const zero = 0; + + cl(EnqueueFillBuffer(cq, + extent->drw, + &zero, + sizeof(zero), + 0, + extent->size, + 0,NULL,event)); +} + +// +// DURABLE SMALL EXTENTS BACKING ATOMICS +// + +void +skc_extent_phr_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phr_pdrw * const extent, + size_t const size) +{ + extent->size = size; + extent->hr = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_ONLY,size); + extent->drw = skc_runtime_device_perm_alloc(runtime,CL_MEM_READ_WRITE,size); +} + +void +skc_extent_phr_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_phr_pdrw * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hr); + skc_runtime_device_perm_free(runtime,extent->drw); +} + +void +skc_extent_phr_pdrw_read(struct skc_extent_phr_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + cl(EnqueueReadBuffer(cq, + extent->drw, + CL_FALSE, + 0, + extent->size, + extent->hr, + 0,NULL,event)); +} + +void +skc_extent_phr_pdrw_zero(struct skc_extent_phr_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + skc_uint const zero = 0; + + cl(EnqueueFillBuffer(cq, + extent->drw, + &zero, + sizeof(zero), + 0, + extent->size, + 0,NULL,event)); +} + +// +// EPHEMERAL SMALL EXTENTS BACKING ATOMICS +// + +void +skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_thr_tdrw * const extent, + size_t const size) +{ + extent->size = size; + extent->hr = skc_runtime_host_temp_alloc(runtime, + SKC_MEM_FLAGS_READ_WRITE, + size,&extent->id.hr,NULL); + extent->drw = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_WRITE, + size, + &extent->id.drw, + NULL); +} + +void 
+skc_extent_thr_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_thr_tdrw * const extent) +{ + skc_runtime_host_temp_free(runtime,extent->hr,extent->id.hr); + skc_runtime_device_temp_free(runtime,extent->drw,extent->id.drw); +} + +void +skc_extent_thr_tdrw_read(struct skc_extent_thr_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + cl(EnqueueReadBuffer(cq, + extent->drw, + CL_FALSE, + 0, + extent->size, + extent->hr, + 0,NULL,event)); +} + +void +skc_extent_thr_tdrw_zero(struct skc_extent_thr_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + if (extent->size == 0) + return; + + skc_uint const zero = 0; + + cl(EnqueueFillBuffer(cq, + extent->drw, + &zero, + sizeof(zero), + 0, + extent->size, + 0,NULL,event)); +} + +// +// DURABLE W/1 HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT +// + +void +skc_extent_phw1g_tdrNs_alloc(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent, + size_t const size) +{ + extent->hw1 = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_WRITE_ONLY,size); +} + +void +skc_extent_phw1g_tdrNs_free(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hw1); +} + +void +skc_extent_phw1g_tdrNs_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phw1g_tdrNs_snap * const snap) +{ + snap->snap = skc_extent_ring_snap_alloc(runtime,ring); +} + +void +skc_extent_phw1g_tdrNs_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent, + struct skc_extent_phw1g_tdrNs_snap * const snap, + cl_command_queue const cq, + cl_event * const event) +{ + struct skc_extent_ring const * const ring = snap->snap->ring; + + skc_uint const count = skc_extent_ring_snap_count(snap->snap); + size_t const size = count * ring->size.elem; + + snap->drN = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, + size,&snap->id,NULL); + + if (count == 0) + return; + + // possibly two copies + skc_uint const index_lo = snap->snap->reads & ring->size.mask; + skc_uint const count_max = ring->size.pow2 - index_lo; + skc_uint const count_lo = min(count_max,count); + size_t const bytes_lo = count_lo * ring->size.elem; + + if (count > count_max) + { + skc_uint const bytes_hi = (count - count_max) * ring->size.elem; + + cl(EnqueueWriteBuffer(cq, + snap->drN, + CL_FALSE, + bytes_lo, + bytes_hi, + extent->hw1, // offset_hi = 0 + 0,NULL,NULL)); + } + + size_t const offset_lo = index_lo * ring->size.elem; + + cl(EnqueueWriteBuffer(cq, + snap->drN, + CL_FALSE, + 0, + bytes_lo, + (skc_uchar*)extent->hw1 + offset_lo, + 0,NULL,event)); + +} + +void +skc_extent_phw1g_tdrNs_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs_snap * const snap) +{ + skc_runtime_device_temp_free(runtime,snap->drN,snap->id); + skc_extent_ring_snap_free(runtime,snap->snap); +} + +// +// DURABLE R/W HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT +// + +void +skc_extent_phrwg_tdrNs_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent, + size_t const size) +{ + extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); // WRITE-ONCE +} + +void +skc_extent_phrwg_tdrNs_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hrw); +} + +void 
+skc_extent_phrwg_tdrNs_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phrwg_tdrNs_snap * const snap) +{ + snap->snap = skc_extent_ring_snap_alloc(runtime,ring); +} + +void +skc_extent_phrwg_tdrNs_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent, + struct skc_extent_phrwg_tdrNs_snap * const snap, + cl_command_queue const cq, + cl_event * const event) +{ + struct skc_extent_ring const * const ring = snap->snap->ring; + + skc_uint const count = skc_extent_ring_snap_count(snap->snap); + size_t const size = count * ring->size.elem; + + snap->drN = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, + size,&snap->id,NULL); + + if (count == 0) + return; + + // possibly two copies + skc_uint const index_lo = snap->snap->reads & ring->size.mask; + skc_uint const count_max = ring->size.pow2 - index_lo; + skc_uint const count_lo = min(count_max,count); + size_t const bytes_lo = count_lo * ring->size.elem; + + if (count > count_max) + { + skc_uint const count_hi = count - count_max; + skc_uint const bytes_hi = count_hi * ring->size.elem; + + cl(EnqueueWriteBuffer(cq, + snap->drN, + CL_FALSE, + bytes_lo, + bytes_hi, + extent->hrw, // offset_hi = 0 + 0,NULL,NULL)); + } + + size_t offset_lo = index_lo * ring->size.elem; + + cl(EnqueueWriteBuffer(cq, + snap->drN, + CL_FALSE, + 0, + bytes_lo, + (skc_uchar*)extent->hrw + offset_lo, + 0,NULL,event)); + +} + +void +skc_extent_phrwg_tdrNs_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs_snap * const snap) +{ + skc_runtime_device_temp_free(runtime,snap->drN,snap->id); + skc_extent_ring_snap_free(runtime,snap->snap); +} + +// +// DURABLE HOST R/W RING WITH AN EPHEMERAL HOST R/1 SNAPSHOT +// +// Note that because the ring and snapshot are both in host memory and +// the snapshot blocks progress until freed we can simply point the +// fake ephemeral snapshot at the ring's durable extent. 
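//
// Both the device-snapshot copies above and the host-only snapshot
// below split a wrapped ring range into a "lo" span and an optional
// "hi" span. A small illustrative helper (not part of the source)
// isolating that arithmetic, assuming the ring capacity is a power of
// two so the mask is pow2 - 1:
//

#include <stddef.h>

struct skc_ring_span { size_t offset; size_t bytes; };

static void
skc_ring_split(skc_uint const reads, // ring read cursor
               skc_uint const count, // elements in the snapshot
               skc_uint const pow2,  // ring capacity -- power of two
               size_t   const elem,  // bytes per element
               struct skc_ring_span * const lo,
               struct skc_ring_span * const hi)
{
  skc_uint const index_lo  = reads & (pow2 - 1);  // where the snapshot starts
  skc_uint const count_max = pow2 - index_lo;     // contiguous elements before the wrap
  skc_uint const count_lo  = count < count_max ? count : count_max;

  lo->offset = (size_t)index_lo * elem;
  lo->bytes  = (size_t)count_lo * elem;

  hi->offset = 0;                                 // the wrapped tail restarts at the ring base
  hi->bytes  = (count > count_max) ? (size_t)(count - count_max) * elem : 0;
}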
+// + +void +skc_extent_phrwg_thr1s_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent, + size_t const size) +{ + extent->hrw = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,size); // WRITE-ONCE +} + +void +skc_extent_phrwg_thr1s_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent) +{ + skc_runtime_host_perm_free(runtime,extent->hrw); +} + +void +skc_extent_phrwg_thr1s_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phrwg_thr1s_snap * const snap) +{ + snap->snap = skc_extent_ring_snap_alloc(runtime,ring); +} + +void +skc_extent_phrwg_thr1s_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent, + struct skc_extent_phrwg_thr1s_snap * const snap) +{ + struct skc_extent_ring const * const ring = snap->snap->ring; + + skc_uint const count = skc_extent_ring_snap_count(snap->snap); + skc_uint const index_lo = snap->snap->reads & ring->size.mask; + skc_uint const count_max = ring->size.pow2 - index_lo; + + snap->count.lo = min(count_max,count); + snap->hr1.lo = (skc_uchar*)extent->hrw + (index_lo * ring->size.elem); + + if (count > count_max) + { + snap->count.hi = count - count_max; + snap->hr1.hi = extent->hrw; + } + else + { + snap->count.hi = 0; + snap->hr1.hi = NULL; + } +} + +void +skc_extent_phrwg_thr1s_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s_snap * const snap) +{ + skc_extent_ring_snap_free(runtime,snap->snap); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/extent_cl_12.h b/src/compute/skc/platforms/cl_12/extent_cl_12.h new file mode 100644 index 0000000000..47ba951bb3 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/extent_cl_12.h @@ -0,0 +1,476 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +#include "suballocator.h" +#include "extent_ring.h" + +// +// Legend: +// +// p : durable +// t : ephemeral +// h : host +// d : device +// r : read +// w : write +// 1 : once -- e.g. w1 is 'write-once' +// N : many -- e.g. 
rN is 'read-many' +// g : ring +// s : ring snapshot +// +// Notes: +// +// rw : for now, read-write implies read-write many +// + +// +// DURABLE R/W HOST EXTENT -- STANDARD CACHED MEMORY +// + +struct skc_extent_phrw +{ + void * hrw; +}; + +void +skc_extent_phrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrw * const extent, + size_t const size); + +void +skc_extent_phrw_free(struct skc_runtime * const runtime, + struct skc_extent_phrw * const extent); + +// +// DURABLE R/W DEVICE EXTENT -- ALLOCATED FROM DEVICE HEAP +// + +struct skc_extent_pdrw +{ + cl_mem drw; +}; + +void +skc_extent_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_pdrw * const extent, + size_t const size); + +void +skc_extent_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_pdrw * const extent); + +// +// EPHEMERAL DEVICE R/W EXTENT -- ALLOCATED QUICKLY FROM A MANAGED RING +// + +struct skc_extent_tdrw +{ + size_t size; + cl_mem drw; + skc_subbuf_id_t id; +}; + +void +skc_extent_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_tdrw * const extent, + size_t const size); + +void +skc_extent_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_tdrw * const extent); + +void +skc_extent_tdrw_zero(struct skc_extent_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +// +// DURABLE SMALL EXTENTS BACKING ATOMICS +// + +struct skc_extent_phr_pdrw +{ + size_t size; // must be multiple of words + void * hr; + cl_mem drw; +}; + +void +skc_extent_phr_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phr_pdrw * const extent, + size_t const size); + +void +skc_extent_phr_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_phr_pdrw * const extent); + +void +skc_extent_phr_pdrw_read(struct skc_extent_phr_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phr_pdrw_zero(struct skc_extent_phr_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +// +// EPHEMERAL SMALL EXTENTS BACKING ATOMICS +// + +struct skc_extent_thr_tdrw +{ + size_t size; // must be multiple of words + + void * hr; + cl_mem drw; + + struct { + skc_subbuf_id_t hr; + skc_subbuf_id_t drw; + } id; +}; + +void +skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_thr_tdrw * const extent, + size_t const size); + +void +skc_extent_thr_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_thr_tdrw * const extent); + +void +skc_extent_thr_tdrw_read(struct skc_extent_thr_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_thr_tdrw_zero(struct skc_extent_thr_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +// +// DURABLE W/1 HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT +// + +struct skc_extent_phw1g_tdrNs +{ + void * hw1; +}; + +struct skc_extent_phw1g_tdrNs_snap +{ + struct skc_extent_ring_snap * snap; + cl_mem drN; + skc_subbuf_id_t id; +}; + +void +skc_extent_phw1g_tdrNs_alloc(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent, + size_t const size); + +void +skc_extent_phw1g_tdrNs_free(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs * const extent); + +void +skc_extent_phw1g_tdrNs_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phw1g_tdrNs_snap * const snap); + +void +skc_extent_phw1g_tdrNs_snap_alloc(struct skc_runtime * const runtime, 
+ struct skc_extent_phw1g_tdrNs * const extent, + struct skc_extent_phw1g_tdrNs_snap * const snap, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phw1g_tdrNs_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phw1g_tdrNs_snap * const snap); + +// +// DURABLE R/W HOST RING WITH AN EPHEMERAL R/N DEVICE SNAPSHOT +// + +struct skc_extent_phrwg_tdrNs +{ + void * hrw; +}; + +struct skc_extent_phrwg_tdrNs_snap +{ + struct skc_extent_ring_snap * snap; + cl_mem drN; + skc_subbuf_id_t id; +}; + +void +skc_extent_phrwg_tdrNs_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent, + size_t const size); + +void +skc_extent_phrwg_tdrNs_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent); + +void +skc_extent_phrwg_tdrNs_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phrwg_tdrNs_snap * const snap); + +void +skc_extent_phrwg_tdrNs_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs * const extent, + struct skc_extent_phrwg_tdrNs_snap * const snap, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phrwg_tdrNs_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_tdrNs_snap * const snap); + +// +// DURABLE HOST R/W RING WITH AN EPHEMERAL HOST R/1 SNAPSHOT +// +// Note that because the ring and snapshot are both in host memory and +// the snapshot blocks progress until freed we can simply point the +// fake ephemeral snapshot at the ring's durable extent. +// + +struct skc_extent_phrwg_thr1s +{ + void * hrw; +}; + +struct skc_extent_phrwg_thr1s_snap +{ + struct skc_extent_ring_snap * snap; + + struct { + skc_uint lo; + skc_uint hi; + } count; + + struct { + void * lo; + void * hi; + } hr1; +}; + +void +skc_extent_phrwg_thr1s_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent, + size_t const size); + +void +skc_extent_phrwg_thr1s_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent); + +void +skc_extent_phrwg_thr1s_snap_init(struct skc_runtime * const runtime, + struct skc_extent_ring * const ring, + struct skc_extent_phrwg_thr1s_snap * const snap); + +void +skc_extent_phrwg_thr1s_snap_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s * const extent, + struct skc_extent_phrwg_thr1s_snap * const snap); + +void +skc_extent_phrwg_thr1s_snap_free(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s_snap * const snap); + +// +// EPHEMERAL MAPPING +// +// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. 
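//
// Before moving on to the mapped extents, a hedged usage sketch of the
// write-once ring + ephemeral device snapshot declared above: snapshot
// the pending ring entries, copy them to a temporary device extent,
// hand that extent to a kernel, then free the snapshot. The kernel
// argument index, the blocking wait, and the helper name are
// illustrative -- the real pipeline is event-driven.
//

static void
example_flush_ring(struct skc_runtime            * const runtime,
                   struct skc_extent_phw1g_tdrNs * const extent,
                   struct skc_extent_ring        * const ring,
                   cl_command_queue                const cq,
                   cl_kernel                       const kernel)
{
  struct skc_extent_phw1g_tdrNs_snap snap;

  skc_extent_phw1g_tdrNs_snap_init (runtime,ring,&snap);           // capture the wip entries
  skc_extent_phw1g_tdrNs_snap_alloc(runtime,extent,&snap,cq,NULL); // async copy: host ring -> snap.drN

  size_t const work_size = skc_extent_ring_snap_count(snap.snap);  // assumes a non-empty snapshot

  cl_event complete;

  cl(SetKernelArg(kernel,0,SKC_CL_ARG(snap.drN)));                 // kernel reads the snapshot
  cl(EnqueueNDRangeKernel(cq,kernel,1,NULL,&work_size,NULL,0,NULL,&complete));
  cl(Flush(cq));

  cl(WaitForEvents(1,&complete));
  cl(ReleaseEvent(complete));

  skc_extent_phw1g_tdrNs_snap_free(runtime,&snap);                 // release ring slots and temp extent
}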
+// + +#if 0 +struct skc_extent_thrw_tdrw +{ + size_t size; + cl_mem drw; + skc_subbuf_id_t id; +}; + +void +skc_extent_thrw_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_thrw_tdrw * const extent, + size_t const size); + +void +skc_extent_thrw_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_thrw_tdrw * const extent); + +void * +skc_extent_thrw_tdrw_map_size(struct skc_extent_thrw_tdrw * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event); + +void * +skc_extent_thrw_tdrw_map(struct skc_extent_thrw_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_thrw_tdrw_unmap(struct skc_extent_thrw_tdrw * const extent, + void * const hrN, + cl_command_queue const cq, + cl_event * const event); +#endif + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. +// + +struct skc_extent_phrw_pdrw +{ + size_t size; + cl_mem drw; +}; + +void +skc_extent_phrw_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrw_pdrw * const extent, + size_t const size); + +void +skc_extent_phrw_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_phrw_pdrw * const extent); + +void * +skc_extent_phrw_pdrw_map_size(struct skc_extent_phrw_pdrw * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event); + +void * +skc_extent_phrw_pdrw_map(struct skc_extent_phrw_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phrw_pdrw_unmap(struct skc_extent_phrw_pdrw * const extent, + void * const hrN, + cl_command_queue const cq, + cl_event * const event); + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO R/O HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO W/O DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. +// + +struct skc_extent_phrN_pdwN +{ + size_t size; + cl_mem dwN; +}; + +void +skc_extent_phrN_pdwN_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrN_pdwN * const extent, + size_t const size); + +void +skc_extent_phrN_pdwN_free(struct skc_runtime * const runtime, + struct skc_extent_phrN_pdwN * const extent); + +void * +skc_extent_phrN_pdwN_map_size(struct skc_extent_phrN_pdwN * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event); + +void * +skc_extent_phrN_pdwN_map(struct skc_extent_phrN_pdwN * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phrN_pdwN_unmap(struct skc_extent_phrN_pdwN * const extent, + void * const hrN, + cl_command_queue const cq, + cl_event * const event); + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO W/O HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/O DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. 
+// + +struct skc_extent_phwN_pdrN +{ + size_t size; + cl_mem drN; +}; + +void +skc_extent_phwN_pdrN_alloc(struct skc_runtime * const runtime, + struct skc_extent_phwN_pdrN * const extent, + size_t const size); + +void +skc_extent_phwN_pdrN_free(struct skc_runtime * const runtime, + struct skc_extent_phwN_pdrN * const extent); + +void * +skc_extent_phwN_pdrN_map_size(struct skc_extent_phwN_pdrN * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event); + +void * +skc_extent_phwN_pdrN_map(struct skc_extent_phwN_pdrN * const extent, + cl_command_queue const cq, + cl_event * const event); + +void +skc_extent_phwN_pdrN_unmap(struct skc_extent_phwN_pdrN * const extent, + void * const hwm, + cl_command_queue const cq, + cl_event * const event); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/extent_cl_12_unified.c b/src/compute/skc/platforms/cl_12/extent_cl_12_unified.c new file mode 100644 index 0000000000..69c669ad54 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/extent_cl_12_unified.c @@ -0,0 +1,281 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT NONE OF THESE EXTENTS CHECK FOR ZERO-SIZED ALLOCATIONS. +// THAT'S OK FOR NOW. +// + +#include + +#include "runtime_cl_12.h" +#include "extent_cl_12.h" +#include "common/cl/assert_cl.h" + +// +// EPHEMERAL MAPPING +// +// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. +// + +#if 0 + +#pragma message("struct skc_extent_thrw_tdrw will be removed once the sorter is installed.") + +void +skc_extent_thrw_tdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_thrw_tdrw * const extent, + size_t const size) +{ + extent->drw = skc_runtime_device_temp_alloc(runtime, + CL_MEM_READ_WRITE /* | CL_MEM_ALLOC_HOST_PTR */, + size,&extent->id,&extent->size); +} + +void +skc_extent_thrw_tdrw_free(struct skc_runtime * const runtime, + struct skc_extent_thrw_tdrw * const extent) +{ + skc_runtime_device_temp_free(runtime,extent->drw,extent->id); +} + +void * +skc_extent_thrw_tdrw_map_size(struct skc_extent_thrw_tdrw * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event) +{ + cl_int cl_err; + + void * hrw = clEnqueueMapBuffer(cq,extent->drw, + CL_FALSE, + CL_MAP_READ | CL_MAP_WRITE,0,size, + 0,NULL,event,&cl_err); cl_ok(cl_err); + + return hrw; +} + +void * +skc_extent_thrw_tdrw_map(struct skc_extent_thrw_tdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + return skc_extent_thrw_tdrw_map_size(extent,extent->size,cq,event); +} + +void +skc_extent_thrw_tdrw_unmap(struct skc_extent_thrw_tdrw * const extent, + void * const hrw, + cl_command_queue const cq, + cl_event * const event) +{ + cl(EnqueueUnmapMemObject(cq,extent->drw,hrw,0,NULL,event)); +} + +#endif + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO R/W HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/W DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. 
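//
// Hedged usage sketch of the durable mapped extent implemented just
// below: map it, initialize it on the host, unmap it so the device can
// use it, and free it when done. The size and the surrounding control
// flow are illustrative only.
//

static void
example_phrw_pdrw_roundtrip(struct skc_runtime * const runtime,
                            cl_command_queue     const cq)
{
  struct skc_extent_phrw_pdrw extent;

  skc_extent_phrw_pdrw_alloc(runtime,&extent,4096);

  // the map is enqueued without blocking, so wait before touching it
  cl_event mapped;
  void *   hrw = skc_extent_phrw_pdrw_map(&extent,cq,&mapped);

  cl(WaitForEvents(1,&mapped));
  cl(ReleaseEvent(mapped));

  // ... write initial data through hrw here ...

  skc_extent_phrw_pdrw_unmap(&extent,hrw,cq,NULL); // hand the extent back to the device
  cl(Flush(cq));

  // ... enqueue kernels that read/write extent.drw here ...

  skc_extent_phrw_pdrw_free(runtime,&extent);
}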
+// + +void +skc_extent_phrw_pdrw_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrw_pdrw * const extent, + size_t const size) +{ + cl_int cl_err; + + extent->size = size; + extent->drw = clCreateBuffer(runtime->cl.context, + CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + size,NULL,&cl_err); cl_ok(cl_err); +} + +void +skc_extent_phrw_pdrw_free(struct skc_runtime * const runtime, + struct skc_extent_phrw_pdrw * const extent) +{ + cl(ReleaseMemObject(extent->drw)); +} + +void * +skc_extent_phrw_pdrw_map_size(struct skc_extent_phrw_pdrw * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event) +{ + cl_int cl_err; + + void * hrw = clEnqueueMapBuffer(cq,extent->drw, + CL_FALSE, + CL_MAP_READ | CL_MAP_WRITE,0,size, + 0,NULL,event,&cl_err); cl_ok(cl_err); + + return hrw; +} + +void * +skc_extent_phrw_pdrw_map(struct skc_extent_phrw_pdrw * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + return skc_extent_phrw_pdrw_map_size(extent,extent->size,cq,event); +} + +void +skc_extent_phrw_pdrw_unmap(struct skc_extent_phrw_pdrw * const extent, + void * const hrw, + cl_command_queue const cq, + cl_event * const event) +{ + cl(EnqueueUnmapMemObject(cq,extent->drw,hrw,0,NULL,event)); +} + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO R/O HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO W/O DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. +// + +void +skc_extent_phrN_pdwN_alloc(struct skc_runtime * const runtime, + struct skc_extent_phrN_pdwN * const extent, + size_t const size) +{ + cl_int cl_err; + + extent->size = size; + extent->dwN = clCreateBuffer(runtime->cl.context, + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + size,NULL,&cl_err); cl_ok(cl_err); +} + +void +skc_extent_phrN_pdwN_free(struct skc_runtime * const runtime, + struct skc_extent_phrN_pdwN * const extent) +{ + cl(ReleaseMemObject(extent->dwN)); +} + +void * +skc_extent_phrN_pdwN_map_size(struct skc_extent_phrN_pdwN * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event) +{ + cl_int cl_err; + + void * hrN = clEnqueueMapBuffer(cq,extent->dwN, + CL_FALSE, + CL_MAP_READ,0,size, + 0,NULL,event,&cl_err); cl_ok(cl_err); + + return hrN; +} + +void * +skc_extent_phrN_pdwN_map(struct skc_extent_phrN_pdwN * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + return skc_extent_phrN_pdwN_map_size(extent,extent->size,cq,event); +} + +void +skc_extent_phrN_pdwN_unmap(struct skc_extent_phrN_pdwN * const extent, + void * const hrN, + cl_command_queue const cq, + cl_event * const event) +{ + cl(EnqueueUnmapMemObject(cq,extent->dwN,hrN,0,NULL,event)); +} + +// +// DURABLE MAPPING +// +// ENTIRE EXTENT MAPPED TO W/O HOST MEMORY +// ENTIRE EXTENT UNMAPPED TO R/O DEVICE MEMORY +// +// Note: integrated vs. discrete GPUs will have different +// implementations because we don't want a GPU kernel repeatedly +// accessing pinned memory. 
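//
// The cl(...) and cl_ok(...) wrappers used throughout these files come
// from "common/cl/assert_cl.h", which is not part of this diff. A
// plausible minimal stand-in is sketched here purely for readability;
// the real macros may differ (for example by printing the failing
// expression). Assumes the OpenCL headers are already included.
//

#include <stdio.h>
#include <stdlib.h>

#define cl_ok(err)                                       \
  do {                                                   \
    cl_int const err_ = (err);                           \
    if (err_ != CL_SUCCESS) {                            \
      fprintf(stderr,"OpenCL error %d at %s:%d\n",       \
              err_,__FILE__,__LINE__);                   \
      abort();                                           \
    }                                                    \
  } while (0)

// cl(Flush(cq)) expands to cl_ok(clFlush(cq))
#define cl(func_and_args) cl_ok(cl##func_and_args)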
+// + +void +skc_extent_phwN_pdrN_alloc(struct skc_runtime * const runtime, + struct skc_extent_phwN_pdrN * const extent, + size_t const size) +{ + cl_int cl_err; + + extent->size = size; + extent->drN = clCreateBuffer(runtime->cl.context, + CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + size,NULL,&cl_err); cl_ok(cl_err); +} + +void +skc_extent_phwN_pdrN_free(struct skc_runtime * const runtime, + struct skc_extent_phwN_pdrN * const extent) +{ + cl(ReleaseMemObject(extent->drN)); +} + +void * +skc_extent_phwN_pdrN_map_size(struct skc_extent_phwN_pdrN * const extent, + size_t const size, + cl_command_queue const cq, + cl_event * const event) +{ + cl_int cl_err; + + void * hwN = clEnqueueMapBuffer(cq,extent->drN, + CL_FALSE, + CL_MAP_WRITE,0,size, + 0,NULL,event,&cl_err); cl_ok(cl_err); + + return hwN; +} + +void * +skc_extent_phwN_pdrN_map(struct skc_extent_phwN_pdrN * const extent, + cl_command_queue const cq, + cl_event * const event) +{ + return skc_extent_phwN_pdrN_map_size(extent,extent->size,cq,event); +} + +void +skc_extent_phwN_pdrN_unmap(struct skc_extent_phwN_pdrN * const extent, + void * const hwN, + cl_command_queue const cq, + cl_event * const event) +{ + cl(EnqueueUnmapMemObject(cq,extent->drN,hwN,0,NULL,event)); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/gl/interop.c b/src/compute/skc/platforms/cl_12/gl/interop.c new file mode 100644 index 0000000000..6697bb7e83 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/gl/interop.c @@ -0,0 +1,629 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include + +// +// +// + +#include +#include +#include +#include + +// +// +// + +#include "common/cl/assert_cl.h" +#include "types.h" + +// +// +// + +#include "interop.h" +#include "context.h" +#include "runtime_cl_12.h" + +// +// +// + +#include "svg2skc/transform_stack.h" + +// +// +// + +#if 1 +#define SKC_IMAGE_FORMAT GL_RGBA8 +#else +#define SKC_IMAGE_FORMAT GL_RGBA16F +#endif + +// +// +// + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +// +// +// + +struct skc_interop_fb +{ + cl_context context; + + GLuint fbo; + GLuint rbo; + + cl_mem mem; + + int width; + int height; + + bool is_srgb; + bool is_vsync_on; + bool is_fullscreen; + bool is_iconified; + bool is_resized; + bool is_spinning; + bool is_info; + + skc_float scale; + skc_float2 translate; + float rotate_theta; +}; + +static struct skc_interop_fb fb = + { + .mem = NULL, + + .is_srgb = true, + .is_vsync_on = false, + .is_fullscreen = false, + .is_iconified = false, + .is_resized = true, + .is_spinning = false, + .is_info = false, + + .scale = 1.0f, + .translate = { 0.0f, 0.0f }, + .rotate_theta = 0.0f + }; + +// +// FPS COUNTER FROM HERE: +// +// http://antongerdelan.net/opengl/glcontext2.html +// + +static +void +skc_interop_fps(GLFWwindow * window) +{ + if (fb.is_fullscreen) + return; + + // static fps counters + static double stamp_prev = 0.0; + static int frame_count = 0; + + // locals + double const stamp_curr = glfwGetTime(); + double const elapsed = stamp_curr - stamp_prev; + + if (elapsed >= 0.5) + { + stamp_prev = stamp_curr; + + double const fps = (double)frame_count / elapsed; + + char tmp[64]; + + sprintf_s(tmp,64,"(%d x %d) - VSync %s - sRGB %s - FPS: %.2f", + fb.width,fb.height, + fb.is_vsync_on ? "ON" : "OFF", + fb.is_srgb ? 
"ENABLED" : "DISABLED", + fps); + + glfwSetWindowTitle(window,tmp); + + frame_count = 0; + } + + frame_count++; +} + +// +// INITIALIZE GLFW/GLAD +// + +static +void +skc_interop_error_callback(int error, char const * description) +{ + fputs(description,stderr); +} + +// +// +// + +static +void +skc_interop_iconify_callback(GLFWwindow * window, int iconified) +{ + fb.is_iconified = iconified; +} + +// +// +// + +static +void +skc_interop_key_callback(GLFWwindow * window, int key, int scancode, int action, int mods) +{ + if (action == GLFW_RELEASE) + return; + + switch (key) + { + case GLFW_KEY_EQUAL: + fb.rotate_theta = 0.0f; + break; + + case GLFW_KEY_I: + fb.is_info = true; + break; + + case GLFW_KEY_R: + fb.is_spinning ^= true; + break; + + case GLFW_KEY_S: + fb.is_srgb ^= true; + if (fb.is_srgb) + glEnable(GL_FRAMEBUFFER_SRGB); + else + glDisable(GL_FRAMEBUFFER_SRGB); + break; + + case GLFW_KEY_V: + fb.is_vsync_on ^= true; + glfwSwapInterval(fb.is_vsync_on ? 1 : 0); + break; + + case GLFW_KEY_W: + glfwSetWindowSize(window,1024,1024); + break; + + case GLFW_KEY_ESCAPE: + glfwSetWindowShouldClose(window,GL_TRUE); + break; + } +} + +static +void +skc_interop_window_size_callback(GLFWwindow * window, int width, int height) +{ + fb.width = width; + fb.height = height; + fb.is_resized = true; + +#if 0 + skc_render_kernel_set_clip(0,0,width,height); +#endif +} + +static +void +skc_interop_scale(double const scale_offset) +{ +#define SKC_SCALE_FACTOR 1.05 + + static double scale_exp = 0.0; + + scale_exp += scale_offset; + fb.scale = (float)pow(SKC_SCALE_FACTOR,scale_exp); +} + +static +void +skc_interop_scroll_callback(GLFWwindow * window, double xoffset, double yoffset) +{ + bool const ctrl = + (glfwGetKey(window,GLFW_KEY_LEFT_CONTROL) == GLFW_PRESS) || + (glfwGetKey(window,GLFW_KEY_RIGHT_CONTROL) == GLFW_PRESS); + + if (!ctrl) + return; + + skc_interop_scale(yoffset); +} + +static +void +skc_interop_translate(float const dx, float const dy) +{ + float const dx_scaled = dx / fb.scale; + float const dy_scaled = dy / fb.scale; + + float const cos_theta = cosf(fb.rotate_theta); // replace with cospi if available + float const sin_theta = sinf(fb.rotate_theta); // replace with sinpi if available + + fb.translate.x += dx_scaled*cos_theta + dy_scaled*sin_theta; + fb.translate.y += dy_scaled*cos_theta - dx_scaled*sin_theta; +} + +static +void +skc_interop_cursor_position_callback(GLFWwindow * window, double x, double y) +{ + int const state = glfwGetMouseButton(window,GLFW_MOUSE_BUTTON_LEFT); + + static bool is_mouse_dragging = false; + static float x_prev=0.0, y_prev=0.0; + + float const mx = (float)x; + float const my = (float)y; + + if (state == GLFW_PRESS) + { + if (is_mouse_dragging) + { + const bool ctrl = + (glfwGetKey(window,GLFW_KEY_LEFT_CONTROL) == GLFW_PRESS) || + (glfwGetKey(window,GLFW_KEY_RIGHT_CONTROL) == GLFW_PRESS); + + if (ctrl) + { + float const cx = 0.5f * fb.width; + float const cy = 0.5f * fb.height; + + // find angle between mouse and center + float const vx = x_prev - cx; + float const vy = y_prev - cy; + + float const wx = mx - cx; + float const wy = my - cy; + + float const len = sqrtf((vx*vx + vy*vy) * (wx*wx + wy*wy)); + + if (len > 0.0f) + { + float const dot = vx*wx + vy*wy; + float const da = acosf(dot / len); + + if (vx*wy - vy*wx >= 0.0f) + fb.rotate_theta += da; + else + fb.rotate_theta -= da; + + fb.rotate_theta = fmodf(fb.rotate_theta,(float)(M_PI*2.0)); + } + } + else + { + skc_interop_translate(mx - x_prev, + my - y_prev); + } + } + else + { + is_mouse_dragging 
= true; + } + + x_prev = mx; + y_prev = my; + } + else + { + is_mouse_dragging = false; + } +} + +// +// +// + +static +void +skc_interop_resize() +{ + fb.is_resized = false; + + // release the image2d + if (fb.mem != NULL) + cl(ReleaseMemObject(fb.mem)); + + // resize rbo + glNamedRenderbufferStorage(fb.rbo, + SKC_IMAGE_FORMAT, + fb.width, + fb.height); + + // attach rbo to fbo + glNamedFramebufferRenderbuffer(fb.fbo, + GL_COLOR_ATTACHMENT0, + GL_RENDERBUFFER, + fb.rbo); + // + // + // + cl_int cl_err; + + fb.mem = clCreateFromGLRenderbuffer(fb.context, + CL_MEM_WRITE_ONLY, + fb.rbo, + &cl_err); cl_ok(cl_err); + // + // for debugging porpoises! + // + cl_image_format format; + + cl(GetImageInfo(fb.mem, + CL_IMAGE_FORMAT, + sizeof(format), + &format, + NULL)); +} + +// +// +// + +static +void +skc_interop_acquire() +{ + // frame buffer object + glCreateFramebuffers(1,&fb.fbo); + + // render buffer object w/a color buffer + glCreateRenderbuffers(1,&fb.rbo); + + // size rbo + glNamedRenderbufferStorage(fb.rbo, + SKC_IMAGE_FORMAT, + fb.width, + fb.height); + + // attach rbo to fbo + glNamedFramebufferRenderbuffer(fb.fbo, + GL_COLOR_ATTACHMENT0, + GL_RENDERBUFFER, + fb.rbo); +} + +void +skc_interop_register(skc_context_t context) +{ + fb.context = context->runtime->cl.context; +} + +// +// +// + +void +skc_interop_init(GLFWwindow * * window) +{ + // + // INITIALIZE GLFW/GLAD + // + glfwSetErrorCallback(skc_interop_error_callback); + + if (!glfwInit()) + exit(EXIT_FAILURE); + + GLFWmonitor * const primary = glfwGetPrimaryMonitor(); + GLFWvidmode const * const mode = glfwGetVideoMode(primary); + + if (fb.is_fullscreen) + { + fb.width = mode->width; + fb.height = mode->height; + } + else + { + fb.width = 1600; + fb.height = 1024; + } + + glfwWindowHint(GLFW_ALPHA_BITS, 0); + glfwWindowHint(GLFW_DEPTH_BITS, 0); + glfwWindowHint(GLFW_STENCIL_BITS, 0); + + glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE); + + glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); + glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5); + + glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); + + *window = glfwCreateWindow(fb.width,fb.height, + "Skia Compute", + fb.is_fullscreen ? primary : NULL, + NULL); + + if (*window == NULL) + { + glfwTerminate(); + exit(EXIT_FAILURE); + } + + glfwMakeContextCurrent(*window); + + // set up GLAD + gladLoadGLLoader((GLADloadproc)glfwGetProcAddress); + + // ignore vsync for now + glfwSwapInterval(fb.is_vsync_on ? 1 : 0); + + // only copy r/g/b + glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); + + // enable SRGB, disable scissor + glEnable(GL_FRAMEBUFFER_SRGB); + glDisable(GL_SCISSOR_TEST); + + // + // SET USER POINTER AND CALLBACKS + // + glfwSetKeyCallback (*window,skc_interop_key_callback); + glfwSetFramebufferSizeCallback(*window,skc_interop_window_size_callback); + glfwSetScrollCallback (*window,skc_interop_scroll_callback); + glfwSetCursorPosCallback (*window,skc_interop_cursor_position_callback); + glfwSetWindowIconifyCallback (*window,skc_interop_iconify_callback); + + // + // + // + fprintf(stderr, + "GL_VENDOR : %s\n" + "GL_RENDERER : %s\n", + glGetString(GL_VENDOR), + glGetString(GL_RENDERER)); + + // + // acquire an FBO/RBO + // + skc_interop_acquire(); +} + +// +// +// + +#define SKC_ROTATE_STEP ((float)(M_PI / 180.0)) + +static +void +skc_interop_transform(struct skc_transform_stack * ts) +{ + // OpenGL'ism + skc_transform_stack_push_affine(ts, + 1.0f, 0.0f,0.0f, + 0.0f,-1.0f,(float)fb.height); + // multiply + skc_transform_stack_concat(ts); + + // spinner... 
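  //
  // The affine push above flips Y to match OpenGL's bottom-left
  // origin. The spinner below advances the rotation angle one step
  // per frame and wraps it into [0,2*pi) with fmodf so it never
  // grows without bound.
  //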
+ if (fb.is_spinning) + fb.rotate_theta = fmodf(fb.rotate_theta + SKC_ROTATE_STEP,(float)(M_PI*2.0)); + + // always rotate and scale around surface center point + skc_transform_stack_push_rotate_scale_xy(ts, + fb.rotate_theta, + fb.scale,fb.scale, + 0.5f*fb.width,0.5f*fb.height); + skc_transform_stack_concat(ts); + + // where did the mouse take us? + skc_transform_stack_push_translate(ts, + fb.translate.x,fb.translate.y); + skc_transform_stack_concat(ts); +} + + +void +skc_interop_poll(GLFWwindow * window, + struct skc_transform_stack * ts) +{ + // wait until uniconified + while (fb.is_iconified) + { + glfwWaitEvents(); + continue; + } + + // what's happended? + glfwPollEvents(); + + // resize? + if (fb.is_resized) + skc_interop_resize(); + + // monitor fps + skc_interop_fps(window); + + skc_interop_transform(ts); +} + +// +// +// + +void +skc_interop_blit(GLFWwindow * window) +{ + // blit skc rbo + glBlitNamedFramebuffer(fb.fbo,0, + 0,0,fb.width,fb.height, + 0,0,fb.width,fb.height, + GL_COLOR_BUFFER_BIT, + GL_NEAREST); + +#if 0 + // + // FIXME -- this clear does nothing! + // + // As a hack we're clearing the interop'd RBO with a + // clEnqueueFillImage(). + // + float const rgba[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + // GLenum const attachments[] = { GL_COLOR_ATTACHMENT0 }; + // glInvalidateNamedFramebufferData(fb.fbo,1,attachments); + glClearNamedFramebufferfv(fb.fbo,GL_COLOR,0,rgba); +#endif + + // swap buffers + glfwSwapBuffers(window); +} + +// +// +// + +void * +skc_interop_get_fb(GLFWwindow * window) +{ + glFlush(); + + return fb.mem; +} + +// +// +// + +void +skc_interop_get_dim(uint32_t dim[2]) +{ + dim[0] = fb.width; + dim[1] = fb.height; +} + +// +// +// + + diff --git a/src/compute/skc/platforms/cl_12/gl/interop.h b/src/compute/skc/platforms/cl_12/gl/interop.h new file mode 100644 index 0000000000..112d365764 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/gl/interop.h @@ -0,0 +1,42 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "skc.h" + +// +// +// + +void +skc_interop_init(GLFWwindow * * window); + +void +skc_interop_register(skc_context_t context); + +void +skc_interop_poll(GLFWwindow * window, + struct skc_transform_stack * ts); + +void * +skc_interop_get_fb(GLFWwindow * window); + +void +skc_interop_get_dim(uint32_t dim[2]); + +void +skc_interop_blit(GLFWwindow * window); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/handle_pool_cl_12.c b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.c new file mode 100644 index 0000000000..65288c3656 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.c @@ -0,0 +1,752 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include + +// +// +// + +#include "common/cl/assert_cl.h" + +#include "block.h" +#include "grid.h" +#include "config_cl.h" +#include "runtime_cl_12.h" + +// +// FIXME -- these comments are now quite stale +// +// +// HANDLE/ACQUIRE RELEASE +// +// The runtime vends handles just in case we decide to exploit shared +// virtual memory. But for most platforms and devices we will have a +// pool of host-managed handles and on the device there will be a +// table that maps the host handle to a device-managed memory block. 
+// +// HANDLE READINESS +// +// A host handle may reference a path or a raster which is not ready +// for use further down the pipeline because it hasn't yet been +// processed by the device. +// +// The simplest scheme for providing every handle a readiness state is +// to build a map that that marks a new handle as being not-ready +// while being processed by a particular grid id. When the final +// sub-pipeline grid responsible for the path or raster is complete, +// then mark the handle as being ready and eventually return the grid +// id back to the pool. This can be performed on a separate thread. +// +// The side-benefit of this approach is that a handle's reference +// count integral type can spare some bits for its associated grid id. +// +// A more memory-intensive approach uses a 64-bit epoch+grid key and +// relies on the ~56 bits of epoch space to avoid any post +// sub-pipeline status update by assuming that a handle and grid will +// match or mismatch when queried. +// + +#define SKC_HANDLE_REFCNT_HOST_BITS (SKC_MEMBER_SIZE(union skc_handle_refcnt,h) * 8) +#define SKC_HANDLE_REFCNT_DEVICE_BITS (SKC_MEMBER_SIZE(union skc_handle_refcnt,d) * 8) + +#define SKC_HANDLE_REFCNT_HOST_MAX SKC_BITS_TO_MASK(SKC_HANDLE_REFCNT_HOST_BITS) +#define SKC_HANDLE_REFCNT_DEVICE_MAX SKC_BITS_TO_MASK(SKC_HANDLE_REFCNT_DEVICE_BITS) + +// +// +// + +static +void +skc_handle_reclaim_create(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool, + skc_handle_reclaim_type_e const reclaim_type, + skc_device_kernel_id const kernel_id) +{ + struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; + + // init counters + reclaim->bih.rem = 0; + + // acquire kernel + reclaim->kernel = skc_device_acquire_kernel(runtime->device,kernel_id); + reclaim->kernel_id = kernel_id; + + // set default args + cl(SetKernelArg(reclaim->kernel,0,SKC_CL_ARG(runtime->block_pool.ids.drw))); + cl(SetKernelArg(reclaim->kernel,1,SKC_CL_ARG(runtime->block_pool.blocks.drw))); + cl(SetKernelArg(reclaim->kernel,2,SKC_CL_ARG(runtime->block_pool.atomics.drw))); + cl(SetKernelArg(reclaim->kernel,3,SKC_CL_ARG(runtime->config->block_pool.ring_mask))); + cl(SetKernelArg(reclaim->kernel,4,SKC_CL_ARG(runtime->handle_pool.map.drw))); +} + +static +void +skc_handle_reclaim_dispose(struct skc_runtime * const runtime, + skc_handle_reclaim_type_e const reclaim_type) +{ + struct skc_handle_reclaim * const reclaim = runtime->handle_pool.reclaim + reclaim_type; + + cl(ReleaseKernel(reclaim->kernel)); +} + +// +// +// + +#define SKC_HANDLE_POOL_BLOCKS_PAD 8 + +void +skc_handle_pool_create(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool, + skc_uint const size, + skc_uint const width, + skc_uint const recs) +{ + skc_uint const blocks = (size + width - 1) / width; + skc_uint const blocks_padded = blocks + SKC_HANDLE_POOL_BLOCKS_PAD; + skc_uint const handles = blocks * width; + skc_uint const handles_padded = blocks_padded * width; + skc_uint const recs_padded = recs + 2; // one for pointer and one for head node + + skc_extent_pdrw_alloc(runtime,&handle_pool->map,handles * sizeof(skc_block_id_t)); + + handle_pool->handle.indices = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,handles_padded * sizeof(*handle_pool->handle.indices)); + handle_pool->handle.refcnts = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,handles * sizeof(*handle_pool->handle.refcnts)); + handle_pool->block.indices = 
skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,blocks_padded * sizeof(*handle_pool->block.indices)); + handle_pool->recs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,recs_padded * sizeof(*handle_pool->recs)); + + // initialize handles and refcnts + for (skc_uint ii=0; iihandle.indices[ii] = ii; + + for (skc_uint ii=0; iihandle.refcnts[ii].hd = 0; + + handle_pool->handle.count = handles; + + // initialize block accounting + for (skc_uint ii=0; iiblock.indices[ii] = ii; + + handle_pool->block.count = blocks_padded; + handle_pool->block.width = width; + + handle_pool->block.tos = blocks; // pop = pre-decrement / push = post-increment + handle_pool->block.bos = blocks; // pop = post-increment / push = pre-decrement + + // initialize recs -- first two elements are interpreted differently + handle_pool->recs[0].runtime = runtime; + handle_pool->recs[1] = (union skc_handle_reclaim_rec){ .rem = recs, .head = 2 }; + + for (skc_uint ii=2; iirecs[ii] = (union skc_handle_reclaim_rec){ .index = ii, .next = ii+1 }; + + handle_pool->recs[recs_padded-1].next = SKC_UINT_MAX; + + // initialize acquire + handle_pool->acquire.rem = 0; + + // create reclaimers + skc_handle_reclaim_create(runtime, + handle_pool, + SKC_HANDLE_RECLAIM_TYPE_PATH, + SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM); + + skc_handle_reclaim_create(runtime, + handle_pool, + SKC_HANDLE_RECLAIM_TYPE_RASTER, + SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM); +} + +// +// +// + +void +skc_handle_pool_dispose(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool) +{ + skc_handle_reclaim_dispose(runtime,SKC_HANDLE_RECLAIM_TYPE_RASTER); + skc_handle_reclaim_dispose(runtime,SKC_HANDLE_RECLAIM_TYPE_PATH); + + skc_runtime_host_perm_free(runtime,handle_pool->recs); + skc_runtime_host_perm_free(runtime,handle_pool->block.indices); + skc_runtime_host_perm_free(runtime,handle_pool->handle.refcnts); + skc_runtime_host_perm_free(runtime,handle_pool->handle.indices); + + skc_extent_pdrw_free(runtime,&handle_pool->map); +} + +// +// +// + +static +skc_uint +skc_handle_pool_block_readable_pop(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool) +{ + SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,handle_pool->block.tos == 0); + + skc_uint const index = handle_pool->block.indices[--handle_pool->block.tos]; + +#if 0 + skc_handle_t * handles = handle_pool->handle.indices + (index + 1) * handle_pool->block.width; + for (skc_uint ii=0; iiblock.width; ii++) + printf("R-: %u\n",*--handles); +#endif + + return index; +} + +static +void +skc_handle_pool_block_readable_push(struct skc_handle_pool * const handle_pool, + skc_uint const index) +{ + handle_pool->block.indices[handle_pool->block.tos++] = index; + +#if 0 + skc_handle_t * handles = handle_pool->handle.indices + (index + 1) * handle_pool->block.width; + for (skc_uint ii=0; iiblock.width; ii++) + printf("R+: %u\n",*--handles); +#endif +} + + +static +skc_uint +skc_handle_pool_block_writable_pop(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool) +{ + SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,handle_pool->block.bos == handle_pool->block.count); + + return handle_pool->block.indices[handle_pool->block.bos++]; +} + +static +void +skc_handle_pool_block_writable_push(struct skc_handle_pool * const handle_pool, + skc_uint const block_idx) +{ + handle_pool->block.indices[--handle_pool->block.bos] = block_idx; +} + +// +// May need to acquire the path or raster handle *early* just to be +// sure one exists +// + 
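//
// The block-index bookkeeping above is a double-ended stack: one array
// holds "readable" block indices below `tos` and "writable" block
// indices from `bos` upward, so both kinds can be pushed and popped
// without separate allocations. An illustrative reduction of just that
// idea (the real pool, in skc_runtime_handle_device_acquire below,
// adds scheduler waits and block-at-a-time handle hand-out on top):
//

struct example_dual_stack
{
  skc_uint * indices; // blocks_padded entries
  skc_uint   tos;     // readable: pop = indices[--tos], push = indices[tos++]
  skc_uint   bos;     // writable: pop = indices[bos++], push = indices[--bos]
};

static skc_uint example_readable_pop (struct example_dual_stack * const s)                    { return s->indices[--s->tos]; }
static void     example_readable_push(struct example_dual_stack * const s, skc_uint const ii) { s->indices[s->tos++] = ii;   }
static skc_uint example_writable_pop (struct example_dual_stack * const s)                    { return s->indices[s->bos++]; }
static void     example_writable_push(struct example_dual_stack * const s, skc_uint const ii) { s->indices[--s->bos] = ii;   }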
+skc_handle_t +skc_runtime_handle_device_acquire(struct skc_runtime * const runtime) +{ + struct skc_handle_pool * const handle_pool = &runtime->handle_pool; + + // acquire a block of handles at a time + if (handle_pool->acquire.rem == 0) + { + skc_uint const block_idx = skc_handle_pool_block_readable_pop(runtime,handle_pool); + + handle_pool->acquire.block = block_idx; + handle_pool->acquire.rem = handle_pool->block.width; + handle_pool->acquire.handles = handle_pool->handle.indices + (block_idx + 1) * handle_pool->block.width; + } + + // load handle from next block slot + skc_uint const rem = --handle_pool->acquire.rem; + skc_handle_t const handle = *--handle_pool->acquire.handles; + + // initialize refcnt for handle + handle_pool->handle.refcnts[handle] = (union skc_handle_refcnt){ .h = 1, .d = 1 }; + + // if this was the last handle in the block then move the block id + // to the reclamation stack to be used as a scratchpad + if (rem == 0) { + skc_handle_pool_block_writable_push(handle_pool,handle_pool->acquire.block); + } + + return handle; +} + +// +// +// + +static +void +skc_handle_reclaim_completion(union skc_handle_reclaim_rec * const recN) +{ + // get root rec which contains pointer to runtime + union skc_handle_reclaim_rec * const rec0 = recN - recN->index; + union skc_handle_reclaim_rec * const rec1 = rec0 + 1; + + // return block for reading + skc_handle_pool_block_readable_push(&rec0->runtime->handle_pool,recN->block); + + // recN is new head of list + recN->next = rec1->head; + rec1->head = recN->index; + rec1->rem += 1; +} + +static +void +skc_handle_reclaim_cb(cl_event event, cl_int status, union skc_handle_reclaim_rec * const recN) +{ + SKC_CL_CB(status); + + union skc_handle_reclaim_rec * const rec0 = recN - recN->index; + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(rec0->runtime->scheduler,skc_handle_reclaim_completion,recN); +} + +// +// FIXME -- is there an issue launching on the host thread? 
+//
+
+static
+void
+skc_handle_reclaim_launch(struct skc_runtime          * const runtime,
+                          struct skc_handle_pool      * const handle_pool,
+                          struct skc_handle_reclaim   * const reclaim,
+                          union skc_handle_reclaim_rec * const recN)
+{
+  cl(SetKernelArg(reclaim->kernel,
+                  5,
+                  handle_pool->block.width * sizeof(skc_handle_t),
+                  reclaim->bih.handles));
+
+  // acquire a cq
+  cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime);
+
+  cl_event complete;
+
+  // the kernel grid is shaped by the target device
+  skc_device_enqueue_kernel(runtime->device,
+                            reclaim->kernel_id,
+                            cq,
+                            reclaim->kernel,
+                            handle_pool->block.width,
+                            0,NULL,&complete);
+
+  cl(SetEventCallback(complete,CL_COMPLETE,skc_handle_reclaim_cb,recN));
+  cl(ReleaseEvent(complete));
+
+  // kickstart kernel execution
+  cl(Flush(cq));
+
+  // release the cq
+  skc_runtime_release_cq_in_order(runtime,cq);
+}
+
+//
+// reclaim a handle
+//
+
+static
+union skc_handle_reclaim_rec *
+skc_handle_acquire_reclaim_rec(struct skc_runtime     * const runtime,
+                               struct skc_handle_pool * const handle_pool)
+{
+  union skc_handle_reclaim_rec * const rec1 = handle_pool->recs + 1;
+
+  SKC_SCHEDULER_WAIT_WHILE(runtime->scheduler,rec1->rem == 0);
+
+  union skc_handle_reclaim_rec * const recN = handle_pool->recs + rec1->head;
+
+  rec1->head = recN->next;
+  rec1->rem -= 1;
+
+  // fprintf(stderr,"rec1->rem = %u\n",rec1->rem);
+
+  return recN;
+}
+
+static
+void
+skc_runtime_device_reclaim(struct skc_runtime        * const runtime,
+                           struct skc_handle_pool    * const handle_pool,
+                           struct skc_handle_reclaim * const reclaim,
+                           skc_handle_t                const handle)
+{
+  // grab a new block?
+  if (reclaim->bih.rem == 0)
+    {
+      skc_uint const block_idx = skc_handle_pool_block_writable_pop(runtime,handle_pool);
+
+      reclaim->bih.block   = block_idx;
+      reclaim->bih.rem     = handle_pool->block.width;
+      reclaim->bih.handles = handle_pool->handle.indices + (block_idx + 1) * handle_pool->block.width;
+    }
+
+  // store handle -- handle's refcnt was already set to {0:0}
+  *--reclaim->bih.handles = handle;
+
+  // if block is full then launch reclamation kernel
+  if (--reclaim->bih.rem == 0)
+    {
+      union skc_handle_reclaim_rec * recN = skc_handle_acquire_reclaim_rec(runtime,handle_pool);
+
+      recN->block = reclaim->bih.block;
+
+      skc_handle_reclaim_launch(runtime,handle_pool,reclaim,recN);
+    }
+}
+
+//
+// Validate host-provided handles before retaining.
+//
+// Retain validation consists of:
+//
+//   - correct handle type
+//   - handle is in range of pool
+//   - host refcnt is not zero
+//   - host refcnt is not at the maximum value
+//
+// After validation, retain the handles for the host
+//
+
+static
+skc_err
+skc_runtime_handle_host_validated_retain(struct skc_runtime       * const runtime,
+                                         skc_typed_handle_type_e    const handle_type,
+                                         skc_typed_handle_t const * const typed_handles,
+                                         uint32_t                   const count)
+{
+  //
+  // FIXME -- test to make sure handles aren't completely out of range integers
+  //
+
+  union skc_handle_refcnt * const refcnts = runtime->handle_pool.handle.refcnts;
+
+  for (skc_uint ii=0; ii<count; ii++)
+    {
+      skc_typed_handle_t const typed_handle = typed_handles[ii];
+
+      if (!SKC_TYPED_HANDLE_IS_TYPE(typed_handle,handle_type))
+        {
+          return SKC_ERR_HANDLE_INVALID;
+        }
+      else
+        {
+          skc_handle_t const handle = SKC_TYPED_HANDLE_TO_HANDLE(typed_handle);
+
+          if (handle >= runtime->handle_pool.handle.count)
+            {
+              return SKC_ERR_HANDLE_INVALID;
+            }
+          else
+            {
+              union skc_handle_refcnt * const refcnt_ptr = refcnts + handle;
+              skc_uint                  const host       = refcnt_ptr->h;
+
+              if (host == 0)
+                {
+                  return SKC_ERR_HANDLE_INVALID;
+                }
+              else if (host == SKC_HANDLE_REFCNT_HOST_MAX)
+                {
+                  return SKC_ERR_HANDLE_OVERFLOW;
+                }
+            }
+        }
+    }
+
+  //
+  // all the handles validated, so retain them all..
+ // + for (skc_uint ii=0; iideps,rasters,count); + + return SKC_ERR_SUCCESS; +} + +skc_err +skc_runtime_path_host_flush(struct skc_runtime * const runtime, + skc_path_t const * paths, + uint32_t count) +{ + skc_grid_deps_force(runtime->deps,paths,count); + + return SKC_ERR_SUCCESS; +} + +// +// Validate host-provided handles before releasing. +// +// Release validation consists of: +// +// - correct handle type +// - handle is in range of pool +// - host refcnt is not zero +// +// After validation, release the handles for the host +// + +static +skc_err +skc_runtime_host_validated_release(struct skc_runtime * const runtime, + skc_typed_handle_type_e const type, + skc_handle_reclaim_type_e const reclaim_type, + skc_typed_handle_t const * const handles, + uint32_t const count) +{ + struct skc_handle_pool * const handle_pool = &runtime->handle_pool; + union skc_handle_refcnt * const refcnts = handle_pool->handle.refcnts; + + for (skc_uint ii=0; ii= handle_pool->handle.count) + { + return SKC_ERR_HANDLE_INVALID; + } + else + { + union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; + skc_uint const host = refcnt_ptr->h; + + if (host == 0) + { + return SKC_ERR_HANDLE_INVALID; + } + } + } + } + + // + // all the handles validated, so release them all.. + // + struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; + + for (skc_uint ii=0; iihandle_pool.handle.refcnts; + + while (count-- > 0) + { + skc_typed_handle_t const typed_handle = *handles++; + + if (!SKC_TYPED_HANDLE_IS_TYPE(typed_handle,type)) + { + return SKC_ERR_HANDLE_INVALID; + } + else + { + skc_handle_t const handle = SKC_TYPED_HANDLE_TO_HANDLE(typed_handle); + + if (handle >= runtime->handle_pool.handle.count) + { + return SKC_ERR_HANDLE_INVALID; + } + else + { + union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; + union skc_handle_refcnt refcnt = *refcnt_ptr; + + if (refcnt.h == 0) + { + return SKC_ERR_HANDLE_INVALID; + } + else if (refcnt.d == SKC_HANDLE_REFCNT_DEVICE_MAX) + { + return SKC_ERR_HANDLE_OVERFLOW; + } + } + } + } + + return SKC_ERR_SUCCESS; +} + +// +// After validation, retain the handles for the device +// + +void +skc_runtime_handle_device_retain(struct skc_runtime * const runtime, + skc_handle_t const * handles, + uint32_t count) +{ + union skc_handle_refcnt * const refcnts = runtime->handle_pool.handle.refcnts; + + while (count-- > 0) + refcnts[SKC_TYPED_HANDLE_TO_HANDLE(*handles++)].d++; +} + +// +// Release the device-held handles -- no validation required! 
+// + +static +void +skc_runtime_handle_device_release(struct skc_runtime * const runtime, + skc_handle_reclaim_type_e const reclaim_type, + skc_handle_t const * handles, + skc_uint count) +{ + struct skc_handle_pool * const handle_pool = &runtime->handle_pool; + union skc_handle_refcnt * const refcnts = handle_pool->handle.refcnts; + struct skc_handle_reclaim * const reclaim = handle_pool->reclaim + reclaim_type; + + while (count-- > 0) { + skc_handle_t const handle = *handles++; + union skc_handle_refcnt * const refcnt_ptr = refcnts + handle; + union skc_handle_refcnt refcnt = *refcnt_ptr; + + refcnt.d -= 1; + *refcnt_ptr = refcnt; + +#if 0 + printf("%8u = { %u, %u }\n",handle,refcnt.h,refcnt.d); +#endif + + if (refcnt.hd == 0) { + skc_runtime_device_reclaim(runtime,handle_pool,reclaim,handle); + } + } +} + +// +// +// + +void +skc_runtime_path_device_release(struct skc_runtime * const runtime, + skc_handle_t const * handles, + skc_uint count) +{ + skc_runtime_handle_device_release(runtime,SKC_HANDLE_RECLAIM_TYPE_PATH,handles,count); +} + +void +skc_runtime_raster_device_release(struct skc_runtime * const runtime, + skc_handle_t const * handles, + skc_uint count) +{ + skc_runtime_handle_device_release(runtime,SKC_HANDLE_RECLAIM_TYPE_RASTER,handles,count); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/handle_pool_cl_12.h b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.h new file mode 100644 index 0000000000..4fefae3552 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/handle_pool_cl_12.h @@ -0,0 +1,177 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "macros.h" +#include "handle.h" +#include "extent_cl_12.h" +#include "device_cl_12.h" + +// +// FIXME -- THIS DOCUMENTATION IS STALE NOW THAT A REFERENCE COUNT REP +// IS A {HOST:DEVICE} PAIR. +// +// Host-side handle pool +// +// The bulk size of the three extents is currently 6 bytes of overhead +// per number of host handles. The number of host handles is usually +// less than the number of blocks in the pool. Note that the maximum +// number of blocks is 2^27. +// +// A practical instantiation might provide a combined 2^20 path and +// raster host handles. This would occupy 6 MB of host RAM for the +// 32-bit handle, 8-bit reference count and 8-bit handle-to-grid map. +// +// Also note that we could use isolated/separate path and raster block +// pools. Worst case, this would double the memory footprint of SKC. +// +// Host-side handle reference count +// +// [0 ] : release +// [1..UMAX] : retain +// +// In a garbage-collected environment we might want to rely on an +// existing mechanism for determing whether a handle is live. +// +// Otherwise, we probably want to have a 16 or 32-bit ref count. +// +// The handle reference count is defensive and will not allow the host +// to underflow a handle that's still retained by the pipeline. +// +// The single reference counter is split into host and device counts. 
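+//
+// For example (illustrative values only): a freshly acquired handle
+// starts at { .h = 1, .d = 1 }; a host release drops it to
+// { .h = 0, .d = 1 }; and only once the combined .hd field reaches
+// zero is the handle queued for device-side reclamation.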
+// + +union skc_handle_refcnt +{ + skc_ushort hd; // host and device + + struct { + skc_uchar h; // host + skc_uchar d; // device + }; +}; + +SKC_STATIC_ASSERT(SKC_MEMBER_SIZE(union skc_handle_refcnt,hd) == + SKC_MEMBER_SIZE(union skc_handle_refcnt,h) + + SKC_MEMBER_SIZE(union skc_handle_refcnt,d)); + +// +// +// + +struct skc_handle_bih +{ + skc_uint block; + skc_uint rem; + skc_handle_t * handles; +}; + +struct skc_handle_reclaim +{ + struct skc_handle_bih bih; + + cl_kernel kernel; + skc_device_kernel_id kernel_id; +}; + +union skc_handle_reclaim_rec +{ + // ELEMENT 0 + struct skc_runtime * runtime; + + // ELEMENT 1 + struct { + skc_uint rem; // # of available records + skc_uint head; // index of first record + }; + + // ELEMENTS 2+ + struct { + skc_uint index; // index of this record -- never modified + union { + skc_uint next; // index of next record + skc_uint block; // block index of reclaimed handles + }; + }; +}; + +SKC_STATIC_ASSERT(sizeof(union skc_handle_reclaim_rec) == sizeof(skc_uint2)); + +// +// +// + +typedef enum skc_handle_reclaim_type_e { + + SKC_HANDLE_RECLAIM_TYPE_PATH, + SKC_HANDLE_RECLAIM_TYPE_RASTER, + + SKC_HANDLE_RECLAIM_TYPE_COUNT + +} skc_handle_reclaim_type_e; + +struct skc_handle_pool +{ + // + // FIXME -- should we be pedantic and make these always-host-side + // allocations "extents" as well? I think it's OK not being an + // extent structure for now and is mostly consistent with the rest + // of the code. + // + // FIXME -- the cbs[] array is a little idiosyncratic but the intent + // is to avoid storing the 64-bit backpointer inside of every single + // record. This can be harmonized later. Note that only a few + // hundred outstanding callbacks would represent many many subgroups + // of work and would fully occupy the GPU (if we allow it). + // + // + struct skc_extent_pdrw map; // device-managed extent mapping a host handle to device block id + + struct { + skc_handle_t * indices; // array of individual host handles -- fragmented into blocks + union skc_handle_refcnt * refcnts; // array of reference counts indexed by an individual handle + skc_uint count; + } handle; + + struct { + skc_uint * indices; // stack of indices to fixed-size blocks of host handles + skc_uint count; // number of handles -- valid from [0,size) + skc_uint width; // width of a fixed-size block of handles + skc_uint tos; // grows upward / push++ / --pop / # fixed-size blocks for reading + skc_uint bos; // grows downward / --push / pop++ / # fixed-size blocks for writing + } block; + + union skc_handle_reclaim_rec * recs; // array of reclaim records + + struct skc_handle_bih acquire; + struct skc_handle_reclaim reclaim[SKC_HANDLE_RECLAIM_TYPE_COUNT]; +}; + +// +// +// + +void +skc_handle_pool_create(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool, + skc_uint const size, + skc_uint const width, + skc_uint const recs); + +void +skc_handle_pool_dispose(struct skc_runtime * const runtime, + struct skc_handle_pool * const handle_pool); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl new file mode 100644 index 0000000000..726b0a7907 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl @@ -0,0 +1,64 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include "device_cl_12.h" + +// +// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ +// + +__kernel +SKC_BP_INIT_IDS_KERNEL_ATTRIBS +void +skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size) +{ + uint const gid = get_global_id(0); + + // + // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to + // accomplish this with fewer threads and using either IPC and/or + // vector stores -- it should be on certain architectures! + // + + // + // initialize pool with sequence + // + if (gid < bp_size) + ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK; +} + +// +// +// + +__kernel +SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS +void +skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size) +{ + // the version test is to squelch a bug with the Intel OpenCL CPU + // compiler declaring it supports the cl_intel_subgroups extension +#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups) + uint const tid = get_sub_group_local_id(); +#else + uint const tid = get_local_id(0); +#endif + + // + // launch two threads and store [ 0, bp_size ] + // + bp_atomics[tid] = tid * bp_size; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h b/src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h new file mode 100644 index 0000000000..e68579c0f7 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/avx2/device_cl_12_avx2.h @@ -0,0 +1,60 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_ONCE_DEVICE_CL_12_AVX2_H +#define SKC_ONCE_DEVICE_CL_12_AVX2_H + +// +// +// + +#define SKC_DEVICE_BLOCK_WORDS_LOG2 6 +#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 4 + +// +// +// + +#define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) +#define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +// +// +// + +#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) + +// +// +// + +#define SKC_COPY_PATHS_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS +#define SKC_COPY_PATHS_ELEM_WORDS 1 + +// +// +// + +#define SKC_EXPAND_FILLS_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS +#define SKC_EXPAND_FILLS_ELEM_WORDS 1 + +// +// +// + +#define SKC_RASTERIZE_THREADS_PER_BLOCK SKC_DEVICE_SUBBLOCK_WORDS + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c new file mode 100644 index 0000000000..aebe8fdc1d --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c @@ -0,0 +1,938 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#include +#include +#include + +#include "common/cl/assert_cl.h" + +#include "tile.h" +#include "raster.h" +#include "macros.h" + +#include "config_cl.h" +#include "runtime_cl_12.h" + +#include "device_cl_12.h" + +#include "hs/cl/hs_cl_launcher.h" +#include "hs/cl/gen9/hs_cl.h" + +// +// +// + +#define SKC_KERNEL_SPIRV 0 +#define SKC_KERNEL_BINARY 1 +#define SKC_KERNEL_SRC 0 + +// +// +// + +#if SKC_KERNEL_SPIRV + +#include "inl/block_pool_init.pre.spv.inl" +#include "inl/paths_copy.pre.spv.inl" +#include "inl/fills_expand.pre.spv.inl" +#include "inl/rasterize.pre.spv.inl" +#include "inl/segment_ttrk.pre.spv.inl" +#include "inl/rasters_alloc.pre.spv.inl" +#include "inl/prefix.pre.spv.inl" +#include "inl/place.pre.spv.inl" +#include "inl/segment_ttck.pre.spv.inl" +#include "inl/render.pre.spv.inl" +#include "inl/paths_reclaim.pre.spv.inl" +#include "inl/rasters_reclaim.pre.spv.inl" + +#elif SKC_KERNEL_BINARY + +#include "inl/block_pool_init.pre.bin.inl" +#include "inl/paths_copy.pre.bin.inl" +#include "inl/fills_expand.pre.bin.inl" +#include "inl/rasterize.pre.bin.inl" +#include "inl/segment_ttrk.pre.bin.inl" +#include "inl/rasters_alloc.pre.bin.inl" +#include "inl/prefix.pre.bin.inl" +#include "inl/place.pre.bin.inl" +#include "inl/segment_ttck.pre.bin.inl" +#include "inl/render.pre.bin.inl" +#include "inl/paths_reclaim.pre.bin.inl" +#include "inl/rasters_reclaim.pre.bin.inl" + +#elif SKC_KERNEL_SRC + +#include "inl/block_pool_init.pre.src.inl" +#include "inl/paths_copy.pre.src.inl" +#include "inl/fills_expand.pre.src.inl" +#include "inl/rasterize.pre.src.inl" +#include "inl/segment_ttrk.pre.src.inl" +#include "inl/rasters_alloc.pre.src.inl" +#include "inl/prefix.pre.src.inl" +#include "inl/place.pre.src.inl" +#include "inl/segment_ttck.pre.src.inl" +#include "inl/render.pre.src.inl" +#include "inl/paths_reclaim.pre.src.inl" +#include "inl/rasters_reclaim.pre.src.inl" + +#endif + +// +// FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY +// + +static +struct skc_config const config = + { + .suballocator = { + .host = { + .size = 1024 * 1024, // words + .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) + }, + .device = { + .size = 128 * 1024 * 1024, + .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t))) + } + }, + + .scheduler = { + .size = 4096 // 128 // fixme -- this is just for testing -- too big + }, + + .subblock = { + .words = SKC_DEVICE_SUBBLOCK_WORDS, // words per subblock -- pow2 + .bytes = SKC_DEVICE_SUBBLOCK_WORDS * sizeof(skc_uint) // bytes per subblock -- pow2 + }, + + .block = { + .words = SKC_DEVICE_BLOCK_WORDS, // words per block -- pow2 + .bytes = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint), // bytes per block -- pow2 + .subblocks = SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS // subblocks per block -- block.bytes >= subblock.bytes + }, + + .block_pool = { + .pool_size = 524288, // blocks in pool -- 128 MB + .ring_pow2 = 524288, // blocks in pool rounded up pow2 + .ring_mask = 524288 - 1 + }, + + .cq_pool = { +#ifndef NDEBUG + .type = SKC_CQ_TYPE_IN_ORDER_PROFILING, +#else + .type = 0, +#endif + .size = 8 + }, + + .handle_pool = { + .size = 262144, // large fraction of block pool size (for now, 1:2) + .width = SKC_RECLAIM_ARRAY_SIZE, + .recs = 256 // too many? too few? 
+ }, + + .tile = { + .width = SKC_TILE_WIDTH, // tile width in pixels + .height = SKC_TILE_HEIGHT, // tile height in pixels + .ratio = SKC_TILE_HEIGHT / SKC_TILE_WIDTH // subblocks per TTPB + }, + + .paths_copy = { + + .buffer = { + .count = 16 // # of subbufs in buffer + }, + + .subbuf = { + .count = 1024 // # of blocks/commands in subbuf + }, + + .block = { + .subbuf = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024, // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN + .buffer = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024 * 16 // block.bytes * subbuf.blocks * subbuf.count + }, + + .command = { + .subbuf = sizeof(skc_uint) * 1024, // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN + .buffer = sizeof(skc_uint) * 1024 * 16 // sizeof(skc_uint) * subbuf.blocks * subbuf.count + }, + + // skc_uint paths_lowat; + }, + + .raster_cohort = { + .path_ids = { + .elem_count = 8192, + .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER + }, + + .transforms = { + .elem_count = 8192, + .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER + }, + + .clips = { + .elem_count = 8192, + .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER + }, + + .fill = { + .elem_count = 8192, + .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER + }, + + .raster_ids = { + .elem_count = 8192, + .snap_count = (1<device->kernels[id] = clCreateKernel(program,name,&cl_err); cl_ok(cl_err); + + // + // release program now + // + // FIXME -- if/when we multithread then we need to clone kernels + // (>=2.1) or keep programs around (<=2.0) + // + + // get workgroup size + cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], + runtime->cl.device_id, + CL_KERNEL_COMPILE_WORK_GROUP_SIZE, + sizeof(runtime->device->reqd_szs[0]), + runtime->device->reqd_szs[id], + NULL)); + + // + // GEN9+ PROBING + // +#define SKC_TARGET_GEN9 +#ifdef SKC_TARGET_GEN9 + +#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 +#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 +#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A + + cl_ulong spill_mem_size; + + cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], + runtime->cl.device_id, + CL_KERNEL_SPILL_MEM_SIZE_INTEL, + sizeof(spill_mem_size), + &spill_mem_size, + NULL)); + + fprintf(stderr,"\t\tspill mem size: %lu bytes\n", + (unsigned long)spill_mem_size); + + cl_ulong local_mem_size; + + cl(GetKernelWorkGroupInfo(runtime->device->kernels[id], + runtime->cl.device_id, + CL_KERNEL_LOCAL_MEM_SIZE, + sizeof(local_mem_size), + &local_mem_size, + NULL)); + + fprintf(stderr,"\t\tlocal mem size: %lu bytes\n", + (unsigned long)local_mem_size); +#endif + } +} + +static +void +skc_device_build_program(struct skc_runtime * const runtime, + struct skc_program_source const * const source, + struct skc_program_kernel const * const kernels, + skc_uint const kernel_count) +{ + cl_program program; + + fprintf(stderr,"%-20s: ",source->name); + + cl_int cl_err; + +#if SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V + + fprintf(stderr,"Creating (SPIR-V) ... "); + + program = clCreateProgramWithIL(runtime->cl.context, + source->src, + source->srclen, + &cl_err); + +#elif SKC_KERNEL_BINARY // PROGRAM IS BINARY + + fprintf(stderr,"Creating (Binary) ... "); + + cl_int status; + program = clCreateProgramWithBinary(runtime->cl.context, + 1, + &runtime->cl.device_id, + &source->srclen, + (unsigned char const *[]){ source->src }, + &status, + &cl_err); + +#elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE + + fprintf(stderr,"Creating (Source) ... 
"); + + program = clCreateProgramWithSource(runtime->cl.context, + 1, + (char const *[]){ source->src }, + &source->srclen, + &cl_err); +#else + +#error "SKC_KERNEL_???" + +#endif + + cl_ok(cl_err); + + fprintf(stderr,"Building ... "); + + // build the program + cl(BuildProgram(program, + 1, + &runtime->cl.device_id, + source->options, // build options are ignored by binary + NULL, + NULL)); + + fprintf(stderr,"Done\n"); + + // build the kernels + skc_device_create_kernels(runtime,kernels,kernel_count,program); + + // we're done with program for now + // can always recover it from a kernel instance + cl(ReleaseProgram(program)); +} + +// +// RELEASE KERNELS +// + +static +void +skc_device_release_kernels(struct skc_device * const device) +{ + for (skc_int ii=0; iikernels); ii++) + cl(ReleaseKernel(device->kernels[ii])); +} + + + +cl_kernel +skc_device_acquire_kernel(struct skc_device * const device, + skc_device_kernel_id const type) +{ + cl_kernel kernel = device->kernels[type]; + + cl(RetainKernel(kernel)); + + return kernel; +} + +// +// INITIALIZE KERNEL ARGS +// +// FIXME +// +// pre-assign any kernel arguments that are never going to change -- +// for example, the block pool +// + +// +// +// + +#define SKC_DEVICE_BUILD_PROGRAM(p) \ + skc_device_build_program(runtime,&program_sources.p,program_kernels.p,SKC_COUNT_OF(program_kernels.p)) + + +void +skc_device_create(struct skc_runtime * const runtime) +{ + struct skc_device * const device = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*device)); + + // hang device off of runtime + runtime->device = device; + + // hang config off of runtime + runtime->config = &config; + + // create kernels + SKC_DEVICE_BUILD_PROGRAM(block_pool_init); + SKC_DEVICE_BUILD_PROGRAM(paths_copy); + SKC_DEVICE_BUILD_PROGRAM(fills_expand); + SKC_DEVICE_BUILD_PROGRAM(rasterize); + SKC_DEVICE_BUILD_PROGRAM(segment_ttrk); + SKC_DEVICE_BUILD_PROGRAM(rasters_alloc); + SKC_DEVICE_BUILD_PROGRAM(prefix); + SKC_DEVICE_BUILD_PROGRAM(place); + SKC_DEVICE_BUILD_PROGRAM(segment_ttck); + SKC_DEVICE_BUILD_PROGRAM(render); + SKC_DEVICE_BUILD_PROGRAM(paths_reclaim); + SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim); + + // create HotSort instance -- FIXME -- how this occurs needs to be cleaned up + hs_create(runtime->cl.context,runtime->cl.device_id,NULL); +} + +void +skc_device_dispose(struct skc_runtime * const runtime) +{ + // + // FIXME -- dispose of programs, kernels, etc. 
+ // + + skc_runtime_host_perm_free(runtime,runtime->device); +} + +// +// FIXME -- just pass the device type +// + +void +skc_device_enqueue_kernel(struct skc_device * const device, + skc_device_kernel_id const type, + cl_command_queue cq, + cl_kernel kernel, + size_t const work_size, + cl_uint num_events_in_wait_list, + cl_event const * const event_wait_list, + cl_event * const event) +{ + if (work_size == 0) + return; + + cl_uint work_dim [1]; + size_t work_global[3]; + size_t work_local [3]; + + size_t * work_local_ptr = program_kernels.kernels[type].shaper(work_size, + work_dim, + work_global, + work_local); + cl(EnqueueNDRangeKernel(cq, + kernel,// device->kernels[type], + work_dim[0], + NULL, + work_global, + work_local_ptr, + num_events_in_wait_list, + event_wait_list, + event)); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h new file mode 100644 index 0000000000..0cac2261e7 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h @@ -0,0 +1,341 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_ONCE_DEVICE_CL_12_H +#define SKC_ONCE_DEVICE_CL_12_H + +// +// FIXME -- THERE ARE SOME DUPLICATED TYPEDEFS IN THIS FILE +// +// THESE WILL GO AWAY AS THE TYPING GET POLISHED AND SIMPLIFIED +// + +#include "block.h" + +// +// +// + +#include + +// +// HOW TO SELECT A SUBBLOCK AND BLOCK SIZES: +// +// 1) The subblock size should match the natural SIMT/SIMD width of +// the target device. +// +// 2) Either a square or rectangular (1:2) tile size is chosen. The +// tile size is usually determined by the amount of SMEM available +// to a render kernel subgroup and desired multiprocessor +// occupancy. +// +// 3) If the tile is rectangular then the block size must be at least +// twice the size of the subblock size. +// +// 4) A large block size can decrease allocation overhead but there +// will be diminishing returns as the block size increases. +// + +#define SKC_DEVICE_BLOCK_WORDS_LOG2 6 // CHANGE "WORDS" TO "SIZE" ? +#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 3 + +#define SKC_TILE_WIDTH_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 +#define SKC_TILE_HEIGHT_LOG2 (SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + 1) + +///////////////////////////////////////////////////////////////// +// +// BLOCK POOL INIT +// + +#define SKC_BP_INIT_IDS_KERNEL_ATTRIBS +#define SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(2,1,1))) + +///////////////////////////////////////////////////////////////// +// +// PATHS ALLOC +// + +#define SKC_PATHS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(1,1,1))) + +///////////////////////////////////////////////////////////////// +// +// PATHS COPY +// + +#define SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? 
+#define SKC_PATHS_COPY_ELEM_WORDS 1 +#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_1() + +#define SKC_PATHS_COPY_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_COPY_SUBGROUP_SIZE))) + +#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + get_sub_group_local_id() >= SKC_PATH_HEAD_WORDS) + +typedef skc_uint skc_paths_copy_elem; +typedef skc_uint skc_pb_idx_v; + +///////////////////////////////////////////////////////////////// +// +// FILLS EXPAND +// + +#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 +#define SKC_FILLS_EXPAND_ELEM_WORDS 1 + +#define SKC_FILLS_EXPAND_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_FILLS_EXPAND_SUBGROUP_SIZE))) + +///////////////////////////////////////////////////////////////// +// +// RASTER ALLOC +// +// NOTE -- Intel subgroup shuffles aren't supported in SIMD32 which is +// why use of the subgroup broadcast produces a compiler error. So a +// subgroup of size 16 is this widest we can require. +// + +#define SKC_RASTERS_ALLOC_GROUP_SIZE 16 + +#if (SKC_RASTERS_ALLOC_GROUP_SIZE <= 16) + +#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE))) +#define SKC_RASTERS_ALLOC_LOCAL_ID() get_sub_group_local_id() +#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) sub_group_scan_inclusive_add(v) +#define SKC_RASTERS_ALLOC_BROADCAST(v,i) sub_group_broadcast(v,i) + +#else + +#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE,1,1))) +#define SKC_RASTERS_ALLOC_LOCAL_ID() get_local_id(0) +#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) work_group_scan_inclusive_add(v) +#define SKC_RASTERS_ALLOC_BROADCAST(v,i) work_group_broadcast(v,i) + +#endif + +///////////////////////////////////////////////////////////////// +// +// RASTERIZE +// + +#define SKC_RASTERIZE_SUBGROUP_SIZE SKC_DEVICE_SUBBLOCK_WORDS +#define SKC_RASTERIZE_VECTOR_SIZE_LOG2 0 +#define SKC_RASTERIZE_WORKGROUP_SUBGROUPS 1 + +#define SKC_RASTERIZE_KERNEL_ATTRIBS \ + __attribute__((intel_reqd_sub_group_size(SKC_RASTERIZE_SUBGROUP_SIZE))) \ + __attribute__((reqd_work_group_size(SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_WORKGROUP_SUBGROUPS, 1, 1))) + +#define SKC_RASTERIZE_FLOAT float +#define SKC_RASTERIZE_UINT uint +#define SKC_RASTERIZE_INT int +#define SKC_RASTERIZE_PREDICATE bool +#define SKC_RASTERIZE_POOL uint + +#define SKC_RASTERIZE_TILE_HASH_X_BITS 1 +#define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 + +typedef skc_block_id_t skc_block_id_v_t; +typedef skc_uint2 skc_ttsk_v_t; +typedef skc_uint2 skc_ttsk_s_t; + +// SKC_STATIC_ASSERT(SKC_RASTERIZE_POOL_SIZE > SKC_RASTERIZE_SUBGROUP_SIZE); + +///////////////////////////////////////////////////////////////// +// +// PREFIX +// + +#define SKC_PREFIX_SUBGROUP_SIZE 8 // for now this had better be SKC_DEVICE_SUBBLOCK_WORDS +#define SKC_PREFIX_WORKGROUP_SUBGROUPS 1 + +#define SKC_PREFIX_KERNEL_ATTRIBS \ + __attribute__((intel_reqd_sub_group_size(SKC_PREFIX_SUBGROUP_SIZE))) \ + __attribute__((reqd_work_group_size(SKC_PREFIX_SUBGROUP_SIZE * SKC_PREFIX_WORKGROUP_SUBGROUPS, 1, 1))) + +#define SKC_PREFIX_TTP_V skc_uint2 +#define SKC_PREFIX_TTS_V_BITFIELD skc_int + +#define SKC_PREFIX_TTS_VECTOR_INT_EXPAND SKC_EXPAND_1 + +#define SKC_PREFIX_SMEM_ZERO ulong +#define SKC_PREFIX_SMEM_ZERO_WIDTH (sizeof(SKC_PREFIX_SMEM_ZERO) / sizeof(skc_ttp_t)) +#define SKC_PREFIX_SMEM_COUNT_BLOCK_ID 8 + +#define SKC_PREFIX_BLOCK_ID_V_SIZE SKC_PREFIX_SUBGROUP_SIZE + +#define SKC_PREFIX_TTXK_V_SIZE SKC_PREFIX_SUBGROUP_SIZE +#define 
SKC_PREFIX_TTXK_V_MASK (SKC_PREFIX_TTXK_V_SIZE - 1) + +typedef skc_uint skc_bp_elem_t; + +typedef skc_uint2 skc_ttrk_e_t; +typedef skc_uint2 skc_ttsk_v_t; +typedef skc_uint2 skc_ttsk_s_t; +typedef skc_uint2 skc_ttpk_s_t; +typedef skc_uint2 skc_ttxk_v_t; + +typedef skc_int skc_tts_v_t; + +typedef skc_int skc_ttp_t; + +typedef skc_uint skc_raster_yx_s; + +typedef skc_block_id_t skc_block_id_v_t; +typedef skc_block_id_t skc_block_id_s_t; + +///////////////////////////////////////////////////////////////// +// +// PLACE +// + +#define SKC_PLACE_SUBGROUP_SIZE 16 +#define SKC_PLACE_WORKGROUP_SUBGROUPS 1 + +#define SKC_PLACE_KERNEL_ATTRIBS \ + __attribute__((intel_reqd_sub_group_size(SKC_PLACE_SUBGROUP_SIZE))) \ + __attribute__((reqd_work_group_size(SKC_PLACE_SUBGROUP_SIZE * SKC_PLACE_WORKGROUP_SUBGROUPS, 1, 1))) + +typedef skc_uint skc_bp_elem_t; + +typedef skc_uint skc_ttsk_lo_t; +typedef skc_uint skc_ttsk_hi_t; + +typedef skc_uint skc_ttpk_lo_t; +typedef skc_uint skc_ttpk_hi_t; + +typedef skc_uint skc_ttxk_lo_t; +typedef skc_uint skc_ttxk_hi_t; + +typedef skc_uint2 skc_ttck_t; + +typedef skc_bool skc_pred_v_t; +typedef skc_int skc_int_v_t; + +///////////////////////////////////////////////////////////////// +// +// RENDER +// + +#define SKC_ARCH_GEN9 + +#if defined(__OPENCL_C_VERSION__) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +#define SKC_RENDER_SUBGROUP_SIZE 8 +#define SKC_RENDER_WORKGROUP_SUBGROUPS 1 + +#define SKC_RENDER_KERNEL_ATTRIBS \ + __attribute__((intel_reqd_sub_group_size(SKC_RENDER_SUBGROUP_SIZE))) \ + __attribute__((reqd_work_group_size(SKC_RENDER_SUBGROUP_SIZE * SKC_RENDER_WORKGROUP_SUBGROUPS, 1, 1))) + +#define SKC_RENDER_SCANLINE_VECTOR_SIZE 2 + +#define SKC_RENDER_REGS_COLOR_R 2 +#define SKC_RENDER_REGS_COVER_R 3 + +#define SKC_RENDER_TTSB_EXPAND() SKC_EXPAND_1() + +#define SKC_RENDER_TTS_V skc_int +#define SKC_RENDER_TTS_V_BITFIELD skc_int + +#define SKC_RENDER_TTP_V skc_int2 +#define SKC_RENDER_AREA_V skc_int2 + +#define SKC_RENDER_TILE_COLOR_PAIR half2 +#define SKC_RENDER_TILE_COLOR_PAIR_LOAD(x,v) vload2(x,v) + +#define SKC_RENDER_SURFACE_COLOR half4 +#define SKC_RENDER_SURFACE_WRITE write_imageh + +// #define SKC_RENDER_TTXB_VECTOR_INT int2 +// #define SKC_RENDER_TTXB_VECTOR_UINT uint2 + +#define SKC_RENDER_WIDE_AA ulong // SLM = 64 bytes/clock + +#define SKC_RENDER_TILE_COLOR half2 +#define SKC_RENDER_TILE_COVER half2 + +#define SKC_RENDER_ACC_COVER_INT int2 +#define SKC_RENDER_ACC_COVER_UINT uint2 + +#define SKC_RENDER_GRADIENT_FLOAT float2 +#define SKC_RENDER_GRADIENT_INT int2 +#define SKC_RENDER_GRADIENT_STOP int2 +#define SKC_RENDER_GRADIENT_FRAC half2 +#define SKC_RENDER_GRADIENT_COLOR_STOP half + +#define SKC_RENDER_SURFACE_U8_RGBA uint2 + +#define SKC_RENDER_TILE_COLOR_VECTOR uint16 +#define SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT uint +#define SKC_RENDER_TILE_COLOR_VECTOR_COUNT ((sizeof(SKC_RENDER_TILE_COLOR) * 4 * SKC_TILE_WIDTH) / sizeof(SKC_RENDER_TILE_COLOR_VECTOR)) + +///////////////////////////////////////////////////////////////// +// +// PATHS & RASTERS RECLAIM +// +// FIXME -- investigate enabling the stride option for a smaller grid +// that iterates over a fixed number of threads. Since reclamation is +// a low-priority task, it's probably reasonable to trade longer +// reclamation times for lower occupancy of the device because it +// might delay the fastpath of the pipeline. 
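+//
+// Spelled out for this target: SKC_RECLAIM_ARRAY_SIZE = 7 x 8 / 2 = 28,
+// i.e. 28 handles are batched per reclamation launch (the runtime's
+// handle_pool.width is configured with this same constant).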
+// + +#define SKC_RECLAIM_ARRAY_SIZE (7 * 8 / 2) // 8 EUs with 7 hardware threads divided by 2 is half a sub-slice + +///////////////////////////////////////////////////////////////// +// +// PATHS RECLAIM +// + +#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? +#define SKC_PATHS_RECLAIM_LOCAL_ELEMS 1 +#define SKC_PATHS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_RECLAIM_SUBGROUP_SIZE))) + +///////////////////////////////////////////////////////////////// +// +// RASTERS RECLAIM +// + +#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? +#define SKC_RASTERS_RECLAIM_LOCAL_ELEMS 1 +#define SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_RECLAIM_SUBGROUP_SIZE))) + +// +// COMMON -- FIXME -- HOIST THESE ELSEWHERE +// + +#define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) +#define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +#define SKC_DEVICE_BLOCK_DWORDS (SKC_DEVICE_BLOCK_WORDS / 2) + +#define SKC_DEVICE_BLOCK_WORDS_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2) +#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2 - SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) + +#define SKC_TILE_RATIO (SKC_TILE_HEIGHT / SKC_TILE_WIDTH) + +// +// +// + +#define SKC_PATHS_COPY_SUBGROUP_SIZE (1 << SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2) +#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE (1 << SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2) +#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE (1 << SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2) +#define SKC_FILLS_EXPAND_SUBGROUP_SIZE (1 << SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2) + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat new file mode 100644 index 0000000000..3631271d9b --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_all.bat @@ -0,0 +1,15 @@ +@ECHO OFF + +CMD /C make_inl_cl.bat ..\..\..\block_pool_init.cl +CMD /C make_inl_cl.bat ..\..\..\fills_expand.cl +CMD /C make_inl_cl.bat ..\..\..\paths_copy.cl +CMD /C make_inl_cl.bat ..\..\..\rasterize.cl +CMD /C make_inl_cl.bat ..\..\..\segment_ttrk.cl +CMD /C make_inl_cl.bat ..\..\..\rasters_alloc.cl +CMD /C make_inl_cl.bat ..\..\..\prefix.cl +CMD /C make_inl_cl.bat ..\..\..\place.cl +CMD /C make_inl_cl.bat ..\..\..\segment_ttck.cl +CMD /C make_inl_cl.bat ..\..\..\render.cl +CMD /C make_inl_cl.bat ..\..\..\paths_reclaim.cl +CMD /C make_inl_cl.bat ..\..\..\rasters_reclaim.cl + diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat new file mode 100644 index 0000000000..e3b0b37651 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/inl/make_inl_cl.bat @@ -0,0 +1,85 @@ +@ECHO OFF + +:: +:: TARGET OPENCL 1.2 +:: + +SET OPENCL_STD=-cl-std=CL1.2 +SET OPENCL_PRE=__OPENCL_C_VERSION__=120 + +:: OPENCL_STD=-cl-std=CL2.0 +:: OPENCL_PRE=__OPENCL_C_VERSION__=200 + +:: +:: +:: + +SET IOC=ioc64 + +:: +:: +:: + +SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info + +SET IOC_IR_OPTS_DBG=%OPENCL_STD% 
-cl-kernel-arg-info -g + +SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% + +:: +:: +:: + +REM SET PRE_DIR=%~p1 +REM CD %PRE_DIR% + +SET PRE_CL=%~n1 +SET PRE_CL=%PRE_CL%.pre.cl + +SET PRE_SRC_INL=%~n1 +SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl + +SET PRE_BIN_IR=%~n1 +SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir + +SET PRE_BIN_INL=%~n1 +SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl + +:: +:: +:: + +SET DIR_CL12="%INTELOCLSDKROOT%include" +SET DIR_COMPUTE=..\..\..\..\..\..\.. +SET DIR_SKC=%DIR_COMPUTE%\skc +SET DIR_PLATFORM=%DIR_SKC%\platforms\cl_12 +SET DIR_DEVICE=.. + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C cl -I %DIR_CL12% -I %DIR_DEVICE% -I %DIR_PLATFORM% -I %DIR_SKC% -I %DIR_COMPUTE% -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" +CMD /C clang-format -style=Mozilla -i %PRE_CL% +CMD /C dos2unix -q %PRE_CL% +CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% + +echo %PRE_CL% +echo %PRE_SRC_INL% + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C touch %PRE_BIN_IR% +ECHO ON +@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% +@ECHO OFF +CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% + +echo %PRE_BIN_IR% +echo %PRE_BIN_INL% + + diff --git a/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl new file mode 100644 index 0000000000..39fee75f3d --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl @@ -0,0 +1,309 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "block.h" +#include "path.h" +#include "common.h" +#include "atomic_cl.h" +#include "raster_builder_cl_12.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) + +#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) +#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) + +#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// +// + +#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// +// + +#if ( SKC_FILLS_EXPAND_X == 1 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_FILLS_EXPAND_X == 2 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_FILLS_EXPAND_X == 4 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_FILLS_EXPAND_X == 8 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_FILLS_EXPAND_X == 16) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_FILLS_EXPAND_X" +#endif + +// +// Fill and rasterize cmds only differ in their first word semantics +// + +union skc_cmd_expand +{ + union skc_cmd_fill fill; + union skc_cmd_rasterize rasterize; +}; + +// +// +// + +union skc_path_elem +{ + skc_uint u32; + skc_float f32; +}; + +// +// COMPILE-TIME AND RUN-TIME MACROS +// + +#define SKC_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +#define 
SKC_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + SKC_BROADCAST_LAST_HELPER(E,I) + +// +// +// + +void +skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out, + skc_uint * const out_idx, + union skc_cmd_expand * const cmd, + union skc_path_elem const e, + skc_uint const e_idx) +{ + // + // FIXME -- we can append a large number of nodeword indices to a + // local SMEM queue and flush when full. It may or may not be a + // performance win on some architectures. + // + skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT; + skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0); + + cmd->rasterize.nodeword = e_idx; + + if (is_elem) { + cmds_out[*out_idx + offset] = cmd->rasterize; + } + + *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1); +} + +// +// +// + +__kernel +SKC_FILLS_EXPAND_KERNEL_ATTRIBS +void +skc_kernel_fills_expand(__global union skc_path_elem const * const blocks, + __global skc_uint volatile * const atomics, + __global skc_block_id_t const * const map, + __global union skc_cmd_fill const * const cmds_in, + __global union skc_cmd_rasterize * const cmds_out) +{ + // + // Need to harmonize the way we determine a subgroup's id. In this + // kernel it's not as important because no local memory is being + // used. Although the device/mask calc to determine subgroup and + // lanes is still proper, we might want to make it clearer that + // we're working with subgroups by using the subgroup API. 
+ // + // every subgroup/simd that will work on the block loads the same command + // +#if (__OPENCL_VERSION__ < 200) + skc_uint const cmd_stride = get_num_sub_groups(); +#else + skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id(); + + // load fill command -- we reuse y component + union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] }; + + // get the path header block from the map + skc_block_id_t id = map[cmd.fill.path]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("expand[%u] = %u\n",cmd_idx,id); +#endif + + // + // blindly load all of the head elements into registers + // + skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + skc_uint count_nodes, count_prims; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ + count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \ + } \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \ + count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // debug of path head + // +#if 0 + skc_uint count_blocks; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ + count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + if (get_sub_group_local_id() == 0) + printf("path header = { %5u, %5u, %5u }\n", + count_blocks,count_nodes,count_prims); +#endif + + // + // acquire slots in the expanded cmd extent + // + // decrement prim_idx by 1 so we can use inclusive warp scan later + // + skc_uint out_idx = 0; + + if (get_sub_group_local_id() == 0) { + out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP + (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1; + } + + out_idx = sub_group_broadcast(out_idx,0); + + // + // process ids trailing the path header + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ + if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \ + h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \ + } \ + } \ + skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \ + head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, process the nodes + // + + // + // get id of next node + // + id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); + + // + // the following blocks are nodes + // + while (true) + { + // get index of each element + skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); + + // + // blindly load all of the node elements into registers + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // append all valid ids + // +#undef 
SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \ + node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // any more nodes? + if (--count_nodes == 0) + return; + + // + // get id of next node + // + id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl new file mode 100644 index 0000000000..302ea14af2 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl @@ -0,0 +1,543 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "path.h" +#include "block_pool_cl.h" +#include "path_builder_cl_12.h" +#include "device_cl_12.h" + +// +// +// + +#if 0 + +// +// SIMD AVX2 +// + +#define SKC_PATHS_COPY_WORDS_PER_ELEM 8 +#define SKC_PATHS_COPY_SUBGROUP_SIZE 1 +#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES + +typedef skc_uint8 skc_paths_copy_elem; +typedef skc_uint8 skc_pb_idx_v; + +#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() + +#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS) + +#endif + +// +// +// + +#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) +#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) +#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) +#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) + +// FIXME -- use SUBGROUP terminology everywhere +#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) + +// +// +// + +#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ + (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) + +#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ + (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) + +// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) + +// +// +// + +// +// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL +// + +#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) + +#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) + +// +// +// + +skc_uint +skc_sub_group_local_id() +{ +#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 + return get_sub_group_local_id(); +#else + return 0; +#endif +} + +// +// convert an atomic read counter offset to a block id +// + +skc_block_id_t +skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, + skc_uint const bp_idx_mask, + skc_uint const bp_reads, + skc_uint const bp_off) +{ + skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; + + return bp_ids[bp_idx]; +} + +// +// +// + +void +skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // 
to + skc_uint const bp_elems_idx, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx) +{ + for (skc_uint ii=0; ii\n",ii,bp_idx,b,elem C); + + SKC_PATHS_COPY_ELEM_EXPAND(); + + // store the elem back + (bp_elems+bp_elems_idx)[ii] = elem; + } +} + +// +// +// + +void +skc_host_map_update(__global skc_uint * const host_map, + skc_uint const block, + skc_paths_copy_elem const elem) +{ + // + // write first elem to map -- FIXME -- this is a little nasty + // because it relies on the the host handle always being the first + // word in the path header. + // + // OTOH, this is not unreasonable. The alternative is to have a + // separate kernel initializing the map. + // +#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 + if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) +#endif + { +#if SKC_PATHS_COPY_ELEM_WORDS == 1 + host_map[elem] = block; +#if 0 + printf("[%u] = %u\n",elem,block); +#endif +#else + host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; +#endif + } +} + +// +// +// + +void +skc_copy_head(__global skc_uint * const host_map, + skc_uint const block, + __global skc_paths_copy_elem * const bp_elems, // to + skc_uint const bp_elems_idx, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_reads, + skc_uint const bp_idx_mask, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx, + skc_uint const pb_rolling) +{ + // + // if there are more path header words than there are + // threads-per-block then we can just copy the initial header words + // +#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) + for (skc_uint ii=0; ii= pb_size) + pb_idx -= pb_size; + + // broadcast load the command + union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; + + // what do we want pb_elems do with this block? 
+ skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); + + // compute offset from rolling base to get index into block pool ring allocation + skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); + + // convert the pb_cmd's offset counter pb_elems a block id + skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); + printf("< %8u >\n",block); + } +#endif + + // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() + skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; + + // calculate bp_elems (to) / pb_elems (from) + skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; + skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; + + if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) + { +#if 0 + if (tid == 0) + printf("%3u, segs\n",bp_off); +#endif + skc_copy_segs(bp_elems, + bp_elems_idx, + pb_elems, + pb_elems_idx); + } + else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) + { +#if 0 + if (tid == 0) + printf("%3u, NODE\n",bp_off); +#endif + skc_copy_node(bp_elems, // to + bp_elems_idx, + bp_ids, + bp_reads, + bp_idx_mask, + pb_elems, // from + pb_elems_idx, + pb_rolling); + } + else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) + { +#if 0 + if (tid == 0) + printf("%3u, HEAD\n",bp_off); +#endif + skc_copy_head(host_map, + block, + bp_elems, // to + bp_elems_idx, + bp_ids, + bp_reads, + bp_idx_mask, + pb_elems, // from + pb_elems_idx, + pb_rolling); + } +} + +// +// +// + +__kernel +SKC_PATHS_ALLOC_KERNEL_ATTRIBS +void +skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, + __global skc_uint * const bp_alloc, + skc_uint const bp_alloc_idx, + skc_uint const pb_cmd_count) +{ + // + // allocate blocks in block pool + // + skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); + + // store in slot + bp_alloc[bp_alloc_idx] = reads; + +#if 0 + printf("pc: %8u + %u\n",reads,pb_cmd_count); +#endif +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl new file mode 100644 index 0000000000..2aee5dac17 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl @@ -0,0 +1,390 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// FIXME -- a pre-allocation step could load the path header quads and +// total up the number of blocks in the workgroup or subgroup +// minimizing the number of later atomics adds. 
+// + +#include "block.h" +#include "path.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS) + +#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS) + +// +// +// + +#if ( SKC_PATHS_RECLAIM_X == 1 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_PATHS_RECLAIM_X == 2 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_PATHS_RECLAIM_X == 4 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_PATHS_RECLAIM_X == 8 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_PATHS_RECLAIM_X == 16) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_PATHS_RECLAIM_X" +#endif + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + SKC_BROADCAST_LAST_HELPER(E,I) + +// +// COMPILE-TIME PREDICATES +// + +#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \ + SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I) + +#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \ + SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I) + +// +// RUN-TIME PREDICATES +// + +#define SKC_PATHS_RECLAIM_IS_HEADER(I) \ + (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS) + +// +// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL +// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK +// COMBOS (NOT NECESSARILY POW2) +// +// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR +// UINT TYPE INSTEAD OF A ULONG. +// + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 +#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint + +// +// +// + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ + (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ + ? 
0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ + S = sub_group_scan_exclusive_add(C) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \ + (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK) + +// +// +// + +struct skc_reclaim +{ + skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; +}; + +__kernel +SKC_PATHS_RECLAIM_KERNEL_ATTRIBS +void +skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring + __global skc_uint * const bp_elems, // block pool blocks + __global skc_uint volatile * const bp_atomics, // read/write atomics + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const map, // path host-to-device map + struct skc_reclaim const reclaim) // array of host path ids +{ +#if (__OPENCL_VERSION__ < 200) + skc_uint const reclaim_stride = get_num_sub_groups(); +#else + skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); + +#if 0 + // + // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT + // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL + // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE + // RECLAMATION JOB ON THE REST OF THE PIPELINE. + // + for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) +#endif + { + // get host path id + skc_path_h const path = reclaim.aN[reclaim_idx]; + + // get the path header block from the map + skc_block_id_t id = map[path]; + + // + // blindly load all of the head elements into registers + // + skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + skc_uint count_blocks, count_nodes; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ + count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ + } \ + if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ + count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes); + } +#endif + + // + // acquire a span in the block pool ids ring for reclaimed ids + // + // FIXME count_blocks and atomic add can be done in same lane + // + skc_uint bp_ids_base = 0; + + if (get_sub_group_local_id() == 0) { + bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); + +#if 0 + printf("paths: bp_ids_base = %u\n",bp_ids_base); +#endif + } + + bp_ids_base = sub_group_broadcast(bp_ids_base,0); + + // + // shift away the tagged block id's tag + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); + + 
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; + } + + // + // - we'll skip subgroups that are entirely header + // + // - but we need to mark any header elements that partially fill + // a subgroup as invalid tagged block ids + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \ + if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \ + h##I = SKC_TAGGED_BLOCK_ID_INVALID; \ + } \ + } \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + { + // + // count reclaimable blocks in each lane + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = h##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // printf("P %7u ! %u\n",bp_ids_idx,h##I); + } + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, walk the nodes + // + do { + // id of next block is in last lane + id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); + + // get index of each element + skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // + // blindly load all of the node elements into registers + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // shift away the tagged block id's tag + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; + } + + // + // count reclaimable blocks in each lane + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint 
const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = n##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // printf("P %7u ! %u\n",bp_ids_idx,n##I); + + // any more nodes? + } while (--count_nodes > 0); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/place.cl b/src/compute/skc/platforms/cl_12/kernels/place.cl new file mode 100644 index 0000000000..92fa0a243d --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/place.cl @@ -0,0 +1,871 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "common.h" +#include "raster.h" +#include "atomic_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1) +#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1) + +// +// +// + +#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK + +// +// +// + +#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE) + +// +// +// + +#if ( SKC_PLACE_X == 1 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_1() +#define SKC_PLACE_EXPAND_I_LAST 0 + +#elif ( SKC_PLACE_X == 2 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_2() +#define SKC_PLACE_EXPAND_I_LAST 1 + +#elif ( SKC_PLACE_X == 4 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_4() +#define SKC_PLACE_EXPAND_I_LAST 3 + +#elif ( SKC_PLACE_X == 8 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_8() +#define SKC_PLACE_EXPAND_I_LAST 7 + +#elif ( SKC_PLACE_X == 16) +#define SKC_PLACE_EXPAND() SKC_EXPAND_16() +#define SKC_PLACE_EXPAND_I_LAST 15 +#endif + +// +// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE +// COALESCED WRITES. LO FIRST, FOLLOWED BY HI. +// +// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE +// KERNELS USE DIFFERENT SUBGROUP SIZES. +// +// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE +// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID. +// +// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER +// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY +// ONLY SUPPORT A SUBGROUP SIZE OF 16. 
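//
// A minimal serial C sketch of the lo/hi key layout described above,
// assuming equal prefix/place subgroup sizes: row I of a block holds
// SUBGROUP_SIZE low words followed by SUBGROUP_SIZE high words, so
// lane L rebuilds its 64-bit xk key from two 32-bit loads one
// subgroup apart.  The subgroup size of 8 is illustrative only.
//

#include <stdint.h>

#define SG_SIZE 8u  /* assumed subgroup size, for the sketch only */

/* rebuild the 64-bit key of (row i, lane l) from a block of 32-bit words */
static uint64_t
xk_load(uint32_t const * const block, uint32_t const i, uint32_t const l)
{
  uint32_t const lo = block[i * 2u * SG_SIZE + l];           /* STRIDE_V_LO */
  uint32_t const hi = block[i * 2u * SG_SIZE + SG_SIZE + l]; /* STRIDE_V_HI */

  return (uint64_t)lo | ((uint64_t)hi << 32);
}

//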
+// + +#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE ) + +#define SKC_PLACE_STRIDE_H(L) (L) +#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1) +#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK)) + +#define SKC_PLACE_STRIDE_H(L) (L) +#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) +#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask + +#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK)) +#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO) + +#endif + +// +// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE +// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8) +// + +#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS) + +#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS) + +#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS) + +#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) + + +// +// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX +// +#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) +#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k)) + +// +// TTSK v2: +// +// 0 63 +// | TTSB ID | PREFIX | SPAN | X | Y | +// +---------+--------+---------+-----+-----+ +// | 27 | 1 (=0) | 12 (=0) | 12 | 12 | +// +// +// TTPK v2: +// +// 0 63 +// | TTPB ID | PREFIX | SPAN | X | Y | +// +---------+--------+------+-----+-----+ +// | 27 | 1 (=1) | 12 | 12 | 12 | +// +// + +// +// TTCK (32-BIT COMPARE) v1: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 18 | 7 | 7 | +// +// +// TTCK (32-BIT COMPARE) v2: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 15 | 9 | 8 | +// +// +// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 27 | 1 | 1 | 18 | 9 | 8 | +// + +union skc_subgroup_smem +{ + skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE + + 
struct { + struct { + skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; + skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; + } lo; + + struct { + skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; + skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; + } hi; + + // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK]; + }; + +}; + +// +// scatter scan max +// +static +skc_int_v_t +skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem, + skc_int_v_t const iss, + skc_int_v_t const ess) +{ + // + // prefix sums determine which lanes we're going to work on next + // + skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE); + skc_int_v_t const scratch_idx = max(ess,0); + + // + // SIMT + // + + // + // zero the volatile smem scratchpad using vector syntax + // + smem->scratch[get_sub_group_local_id()] = ( 0 ); + + // + // store source lane at starting lane + // + if (is_scratch_store) { + smem->scratch[scratch_idx] = get_sub_group_local_id(); + } + + // + // propagate lanes to right using max scan + // + skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()]; + skc_int_v_t const source = sub_group_scan_inclusive_max(scratch); + + return source; +} + +// +// +// + +static +skc_bool +skc_xk_clip(union skc_tile_clip const * const tile_clip, + skc_ttxk_t * const xk) +{ + // + // clip the sk and pk keys + // + // if fully clipped then return false + // + // alternatively -- we can expand all these keys in place + // + // alternatively -- keep sk and pk keys segregated because sk + // represents the vast majority of keys and are easier to process. + // don't mess with the fastpath! + // + return false; +} + +// +// +// + +static +skc_ttck_t +skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const sk_idx) +{ + skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0 + skc_uint const hi = smem->hi.sk[sk_idx]; + + skc_ttck_t ck; + + ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id + + // FIXME -- x and y should already be clipped and shifted + skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; + skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; + + ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; + + return ck; +} + +static +skc_ttck_t +skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const pk_idx, + skc_uint const dx) +{ + skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1 + skc_uint const hi = smem->hi.pk[pk_idx]; + + skc_ttck_t ck; + + ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id + + // FIXME -- x and y should already be clipped and shifted + skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; + skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; + + ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; + + return ck; +} + +// +// +// + +static +void +skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const sk) +{ + // + // Pretty sure you can never ever have an sk 
count equal to 0 + // + skc_uint ck_base = 0; + + // last lane performs the block pool allocation with an atomic increment + if (get_sub_group_local_id() == 0) { + ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk); + } + + // broadcast base to all lanes + ck_base = sub_group_broadcast(ck_base,0); + + // convert sk keys to ck keys + for (skc_uint ii=get_sub_group_local_id(); iilo.pk[idx]; + skc_uint const hi = smem->hi.pk[idx]; + + skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN; + skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN; + + return (span_lo | span_hi) + 1; +} + +// +// +// + +static +void +skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const pk) +{ + // bail out if pk queue is empty + if (pk == 0) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("%u\n",pk); +#endif + + // + // FIXME -- this nested loop iterates over the queue processing a + // subgroup of 64-bit keys at a time. This is probably not the most + // efficient approach so investigate how to store and iterate over a + // wider than subgroup (node-sized) queue of keys. + // + + // round up so we work with full subgroups + skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK; + skc_uint ii = 0; + + // nested loop that expands all ttpk keys +#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE) + for (; ii\n",xk_idx); +#endif + + return xk_idx; +#endif +} + +// +// +// +__kernel +SKC_PLACE_KERNEL_ATTRIBS +void +skc_kernel_place(__global skc_bp_elem_t * const bp_elems, + __global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __global union skc_cmd_place const * const cmds, + __global skc_block_id_t * const map, + skc_uint4 const clip, + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) + __local union skc_subgroup_smem volatile smem[1]; +#else + __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS]; + __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // This is a subgroup-centric kernel + // + // Which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // + // Test the raster's translated bounds against the composition's + // tile clip + // + // There are 3 cases: + // + // - the raster is completely clipped -> return + // - the raster is partially clipped -> all keys must clipped + // - the raster is not clipped -> no keys are tested + // + // + // There are at least 4 implementations of place and we want to + // special-case them as much as possible so that, at the least, the + // fastpath remains fast. 
+ // + // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP + // + // - implement CLIPPED + NO TRANSLATION path + // + // - implement NO CLIP + TRANSLATION path + // + // - implement CLIPPED + TRANSLATION path + // + // + // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin + // 12:12:8 integer where: + // + // 12: ttsk + // 12: ttpk + // 8: /dev/null -- clipped or invalid key + // + // Three kinds of nodes in a raster's list: + // + // - the head node + // - an internal node + // - the final node + // + +#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const cmd_idx = get_group_id(0); +#else + skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // load command + union skc_cmd_place const cmd = cmds[cmd_idx]; + + // get the raster header from the raster host id -- scalar + skc_block_id_t id = map[cmd.raster_h]; + + // + // load all of the head block ttxk keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_raster_node_elem const h##I = { \ + .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \ + bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \ + }; + + SKC_PLACE_EXPAND(); + + // + // load raster header counts -- we only need the "nodes" and "keys" + // words but the keys we loaded are doublewords. + // + // FIXME -- this can be made portable with compile-time macro expansion + // + skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES + skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS + + // + // + // +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ + nodes,keys, \ + I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ + h##I.u32v2.hi,h##I.u32v2.lo, \ + h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); + + SKC_PLACE_EXPAND(); +#endif + + // +#if 0 + if (get_sub_group_local_id() == 0) { + printf("place: %u / %u / %u\n",head_id,nodes,keys); + } +#endif + + { + // + // classify every key in the header + // + // keys: 0 is not a key / 1 is a key + // skpk: 0 is sk / 1 is pk + // + skc_uint bits_keys = 0; + skc_uint bits_skpk = 0; + + // + // calculate bits_keys + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \ + if (idx < keys) { \ + bits_keys |= (1u << I); \ + } \ + if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ + if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \ + if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ + bits_keys &= ~(1u << I); \ + } \ + } \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // blindly calculate bits_skpk + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2X : %2X\n",bits_keys,bits_skpk); +#endif + + // + // next pointer is last element of last row. save it now because + // this might be recognized as a subgroup-uniform/scalar. 
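//
// A minimal serial C sketch of the bits_keys/bits_skpk classification
// computed above: bit I of bits_keys marks row I as holding a live
// key, and bit I of bits_skpk marks that key's prefix bit (TTPK vs
// TTSK).  The row count and prefix-bit position are assumptions of
// the sketch, not the real TTXK layout.
//

#include <stdint.h>

#define ROWS_SKETCH  4u          /* key rows handled per lane (assumed)     */
#define PREFIX_BIT   (1u << 31)  /* assumed position of the TTXK prefix bit */

/* build the per-lane "is a key" and "is a TTPK" row masks */
static void
classify_rows(uint32_t const * const xk_lo,      /* low key word per row           */
              uint32_t         const live,       /* live keys in this lane's rows  */
              uint32_t       * const bits_keys,
              uint32_t       * const bits_skpk)
{
  *bits_keys = 0u;
  *bits_skpk = 0u;

  for (uint32_t i = 0; i < ROWS_SKETCH; i++)
    {
      if (i < live)
        *bits_keys |= 1u << i;       /* row i holds a valid key            */

      if (xk_lo[i] & PREFIX_BIT)
        *bits_skpk |= 1u << i;       /* prefix bit set: a TTPK, not a TTSK */
    }
}

//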
+ // + id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); + + // + // append SK keys first + // + skc_uint const bits_sk = bits_keys & ~bits_skpk; + skc_uint sk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint is_sk = (bits_sk >> I) & 1; \ + skc_uint sk_idx = skc_ballot(&sk,is_sk); \ + if (is_sk) { \ + smem->lo.sk[sk_idx] = h##I.xk.lo; \ + smem->hi.sk[sk_idx] = h##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // append PK keys next + // + skc_uint const bits_pk = bits_keys & bits_skpk; + skc_uint pk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint is_pk = (bits_pk >> I) & 1; \ + skc_uint pk_idx = skc_ballot(&pk,is_pk); \ + if (is_pk) { \ + smem->lo.pk[pk_idx] = h##I.xk.lo; \ + smem->hi.pk[pk_idx] = h##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2u * %2u\n",sk,pk); +#endif + // + // flush the keys + // + skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); + skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); + } + + // + // we're done if there was only a head node + // + if (nodes == 0) + return; + + // + // decrement keys + // + keys -= SKC_RASTER_HEAD_COUNT_KEYS; + + // + // otherwise, append keys in trailing nodes to smem + // + while (true) + { + // + // load all of the node block ttxk keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_raster_node_elem const n##I = { \ + .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \ + bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \ + }; + + SKC_PLACE_EXPAND(); + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ + nodes,keys, \ + I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ + n##I.u32v2.hi,n##I.u32v2.lo, \ + n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); + + SKC_PLACE_EXPAND(); +#endif + + // + // classify every key in the header + // + // keys: 0 is not a key / 1 is a key + // skpk: 0 is sk / 1 is pk + // + skc_uint bits_keys = 0; + skc_uint bits_skpk = 0; + + // + // calculate bits_keys + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \ + if (idx < keys) { \ + bits_keys |= (1u << I); \ + } \ + if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ + if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \ + if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ + bits_keys &= ~(1u << I); \ + } \ + } \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // blindly calculate bits_skpk + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2X : %2X\n",bits_keys,bits_skpk); +#endif + + // + // next pointer is last element of last row. save it now because + // this might be recognized as a subgroup-uniform/scalar. 
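//
// A minimal serial C sketch of the skc_ballot() compaction used in
// the appends above: an exclusive scan over a 0/1 predicate hands
// every selected lane a dense slot in the sk/pk queue and advances
// the queue count by the subgroup total.  The lane count is an
// assumption of the sketch.
//

#include <stdint.h>

#define LANES_SKETCH 8u  /* assumed subgroup size */

/* append the selected per-lane keys to a dense queue; returns the new count */
static uint32_t
ballot_append(uint32_t       * const queue,
              uint32_t         const count,   /* keys already queued        */
              uint32_t const * const keys,    /* one candidate key per lane */
              uint32_t const * const is_sel)  /* 0/1 predicate per lane     */
{
  uint32_t next = count;

  for (uint32_t lane = 0; lane < LANES_SKETCH; lane++)  /* exclusive scan, serialized */
    {
      if (is_sel[lane])
        queue[next++] = keys[lane];   /* each selected lane gets a dense slot */
    }

  return next;
}

//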
+ // + id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); + + // + // append SK keys first + // + skc_uint const bits_sk = bits_keys & ~bits_skpk; + skc_uint sk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint is_sk = (bits_sk >> I) & 1; \ + skc_uint sk_idx = skc_ballot(&sk,is_sk); \ + if (is_sk) { \ + smem->lo.sk[sk_idx] = n##I.xk.lo; \ + smem->hi.sk[sk_idx] = n##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // append PK keys next + // + skc_uint const bits_pk = bits_keys & bits_skpk; + skc_uint pk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint is_pk = (bits_pk >> I) & 1; \ + skc_uint pk_idx = skc_ballot(&pk,is_pk); \ + if (is_pk) { \ + smem->lo.pk[pk_idx] = n##I.xk.lo; \ + smem->hi.pk[pk_idx] = n##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2u * %2u\n",sk,pk); +#endif + // + // if total for either the sk or pk queue reaches the + // highwater mark then flush it to the extent + // + skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); + skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); + + // + // if this was the last node then we're done + // + if (--nodes == 0) + return; + + // + // otherwise decrement keys + // + keys -= SKC_RASTER_NODE_COUNT_KEYS; + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/prefix.cl b/src/compute/skc/platforms/cl_12/kernels/prefix.cl new file mode 100644 index 0000000000..21a51694da --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/prefix.cl @@ -0,0 +1,1041 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "raster.h" +#include "atomic_cl.h" +#include "raster_builder_cl_12.h" +#include "device_cl_12.h" + +// +// INPUT: +// +// TTRK (64-BIT COMPARE) +// +// 0 63 +// | TTSB ID | X | Y | COHORT ID | +// +---------+------+------+-----------+ +// | 27 | 12 | 12 | 13 | +// +// +// TTRK (32-BIT COMPARE) +// +// 0 63 +// | TTSB ID | N/A | X | Y | COHORT ID | +// +---------+-----+------+------+-----------+ +// | 27 | 5 | 12 | 12 | 8 | +// +// +// OUTPUT: +// +// TTSK v2: +// +// 0 63 +// | TTSB ID | PREFIX | N/A | X | Y | +// +---------+--------+------+----+----+ +// | 27 | 1 (=0) | 12 | 12 | 12 | +// +// +// TTPK v1: +// +// 0 63 +// | TTPB ID | ALL ZEROES | SPAN | X | Y | +// +---------+------------+------+-----+-----+ +// | 27 | 1 | 12 | 12 | 12 | +// +// +// TTPK v2: +// +// 0 63 +// | TTPB ID | PREFIX | SPAN | X | Y | +// +---------+--------+------+-----+-----+ +// | 27 | 1 (=1) | 12 | 12 | 12 | +// + +#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1) + +// +// smem accumulator +// + +union skc_subgroup_accum +{ + struct { + SKC_ATOMIC_INT ttp[SKC_TILE_HEIGHT]; + } atomic; + + struct { + skc_ttp_t ttp[SKC_TILE_HEIGHT]; + } aN; + + struct { + SKC_PREFIX_TTP_V ttp[SKC_PREFIX_SUBGROUP_SIZE]; + } vN; + + struct { + SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH]; + } zero; +}; + +// +// +// + +struct skc_subgroup_smem +{ + // prefix accumulator + union skc_subgroup_accum accum; +}; + +// +// +// + +static +skc_uint +skc_subgroup_lane() +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + return get_sub_group_local_id(); +#else + return 0; +#endif +} + +// +// +// + +static +SKC_PREFIX_TTS_V_BITFIELD +skc_tts_get_dy(skc_tts_v_t const ttsv) +{ + // tts.dy is packed to fit in range [-32,31] and unpacked to 
[-32..-1,+1..+32] + SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY; + + return dy - (~ttsv >> 31); +} + +static +SKC_PREFIX_TTS_V_BITFIELD +skc_tts_get_py(skc_tts_v_t const ttsv) +{ + return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2); +} + +// +// +// + +static +void +skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v) +{ + // get "altitude" + SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v); + + // get the y pixel coordinate + SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v); + + // + // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid? + // + // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op + // + +#if 0 + if (tts_v != SKC_TTS_INVALID) + printf("< %08X = %u : %d >\n",tts_v,py,dy); +#endif + + // + // scatter-add the "altitude" to accumulator + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \ + } + +#else + // + // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS + // + // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C == SKC_TTS_INVALID) \ + return; \ + smem->accum.aN.ttp[py C] = dy C; +#endif + + SKC_PREFIX_TTS_VECTOR_INT_EXPAND(); +} + +// +// The implication here is that if our device configuration has a +// rectangular 1:2 tile then we need a block size of at least 2 +// subblocks. The subblock size of course needs to match the length of +// the smallest tile side. +// + +static +void +skc_accum_flush(__local struct skc_subgroup_smem * const smem, + __global skc_bp_elem_t * const bp_elems, + skc_block_id_t const pb_id) +{ + // load the ttp elements + SKC_PREFIX_TTP_V const ttp_v = smem->accum.vN.ttp[get_sub_group_local_id()]; + skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); + +#if ( SKC_TILE_RATIO == 1 ) + + bp_elems[offset] = ttp_v; + +#elif ( SKC_TILE_RATIO == 2 ) + + vstore2(ttp_v,offset,bp_elems); + +#else + +#error("tile ratio greater than 2 not supported") + +#endif +} + +// +// +// + +static +void +skc_accum_reset(__local struct skc_subgroup_smem * const smem) +{ + for (uint ii=0; iiaccum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 ); +} + +// +// get next sk key +// + +static +skc_ttsk_s_t +skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v, + skc_uint * const sk_next, + skc_int * const rkpk_rem) +{ + // decrement count + *rkpk_rem -= 1; + +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT with subgroup support is easy + // + // SIMT without subgroup support can always emulate with smem + // +#if 0 + // + // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly + // broadcast a uint2 cast to a long. It was probably bad to do this + // anyway without a union wrapping the TTSK scalar type. + // + // Consider creating a union { ulong; uint2 } at a later date -- + // probably no need to ever do this unless it makes broadcast faster + // which is unlikely since it will probably be implemented as 2 + // 32-bit broadcasts. + // + // Additionally, the TTRK and TTXK key bitfield sizes are probably + // cast in stone and we aren't going to change them no matter + // architecture we're on. 
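//
// A minimal serial C sketch of the two-broadcast workaround discussed
// above: rather than broadcasting the key as a single 64-bit value,
// its lo and hi words are fetched separately and recombined.  The
// lane arrays stand in for the per-lane subgroup registers.
//

#include <stdint.h>

typedef struct { uint32_t lo, hi; } ttsk_sketch_t;

/* "broadcast" lane `src` of a per-lane key vector as two 32-bit reads */
static ttsk_sketch_t
ttsk_broadcast(uint32_t const * const lo_lanes,
               uint32_t const * const hi_lanes,
               uint32_t         const src)
{
  ttsk_sketch_t const k = { lo_lanes[src], hi_lanes[src] };  /* lo and hi fetched separately */

  return k;
}

//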
+ // + skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++); +#else + skc_ttsk_s_t sk_s; + + sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next); + sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next); + *sk_next += 1; +#endif + +#else + // + // SIMD will always grab component .s0 and then rotate the vector + // + sk_s = ( sk_v->s0 ); + + skc_ttsk_v_rotate_down(sk_v); + +#endif + + return sk_s; +} + +// +// +// + +static +skc_raster_yx_s +skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next) +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT with subgroup support is easy + // + // SIMT without subgroup support can always emulate with smem + // + skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next); + +#else + // + // SIMD will always grab component .s0 and then rotate the vector + // + skc_raster_yx_s const yx_s = ( sk_v->s0.hi ); + +#endif + + return yx_s; +} + +// +// mask off ttsb id +// + +static +skc_block_id_s_t +skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s) +{ + return ( sk_s->lo & SKC_TTXK_LO_MASK_ID ); +} + +// +// load tts_v as early as possible +// + +static +skc_tts_v_t +skc_load_tts(__global skc_bp_elem_t * const bp_elems, + skc_block_id_s_t const sb_id) +{ + return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] ); +} + +// +// massage ttrk keys into ttsk keys +// + +static +void +skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v) +{ + sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID; // clear high (N/A) bits + sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits +} + +// +// replenish ttsk keys +// + +static +void +skc_ttsk_v_replenish(skc_ttsk_v_t * const sk_v, + skc_uint * const sk_next, + skc_uint * const rks_next, + __global skc_ttrk_e_t const * const rks) +{ + // if there are still keys available then return + if (*sk_next < SKC_PREFIX_TTXK_V_SIZE) + return; + + // + // otherwise, replenish sk_v + // + // NOTE NOTE NOTE -- we are assuming rks[] extent size is always + // divisible by TTXK_V_SIZE and therefore loading some keys from the + // next raster is OK. + // + *sk_next = 0; + *rks_next += SKC_PREFIX_SUBGROUP_SIZE; + *sk_v = rks[*rks_next]; + +#if 0 + printf("* %08X ( %3u, %3u )\n", + sk_v->hi, + (sk_v->hi >> 12) & 0xFFF, + (sk_v->hi ) & 0xFFF); +#endif + + skc_ttrk_to_ttsk(sk_v); + +#if 0 + printf("! %08X ( %3u, %3u )\n", + sk_v->hi, + (sk_v->hi >> 20) & 0xFFF, + (sk_v->hi >> 8) & 0xFFF); +#endif +} + +// +// replenish block ids +// +// note that you can't overrun the block id pool since it's a ring +// + +static +void +skc_blocks_replenish(skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) + +{ + *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE; + *blocks = bp_ids[*blocks_idx & bp_mask]; + *blocks_next = 0; + +#if 0 + printf("replenish blocks: %u\n",*blocks); +#endif +} + +// +// +// + +static +skc_block_id_t +skc_blocks_get_next(skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // replenish? 
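//
// A minimal serial C sketch of this allocator: a small per-subgroup
// cache of block ids is consumed one id at a time and refilled from
// the pow2 ring only when it runs dry.  The cache and ring sizes
// below are assumptions of the sketch.
//

#include <stdint.h>

#define CACHE_SIZE_SKETCH 8u      /* one id per lane (assumed)       */
#define RING_MASK_SKETCH  1023u   /* pow2 ring of 1024 ids (assumed) */

struct id_cache_sketch
{
  uint32_t ids[CACHE_SIZE_SKETCH]; /* the per-subgroup vector of ids  */
  uint32_t next;                   /* next unconsumed entry           */
  uint32_t ring_idx;               /* where the cache was filled from */
};

/* hand out one block id, refilling the cache from the ring when dry */
static uint32_t
blocks_get_next_sketch(struct id_cache_sketch * const c,
                       uint32_t const         * const ring)
{
  if (c->next == CACHE_SIZE_SKETCH)                       /* replenish? */
    {
      c->ring_idx += CACHE_SIZE_SKETCH;

      for (uint32_t i = 0; i < CACHE_SIZE_SKETCH; i++)    /* one coalesced load on the device */
        c->ids[i] = ring[(c->ring_idx + i) & RING_MASK_SKETCH];

      c->next = 0;
    }

  return c->ids[c->next++];                               /* broadcast(blocks,next++) on the device */
}

//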
+ if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE) + { + skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + } + +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); + +#else + // + // SIMD + // + skc_block_id_t id = blocks->s0; + + skc_shuffle_down_1(*blocks); + +#endif + + *blocks_next += 1; + + return id; +} + +// +// subblock allocator +// + +#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) + +static +skc_block_id_t +skc_subblocks_get_next_pb_id(skc_block_id_t * const subblocks, + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + } + + skc_block_id_t const pb_id = *subblocks; + + *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks + + return pb_id; +} + +#endif + +// +// append a ttsk key to the work-in-progress node +// + +static +void +skc_node_v_append_sk(skc_ttsk_s_t const * const sk_s, + + skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + skc_uint * const xk_v_idx, + __global skc_bp_elem_t * const bp_elems, + + skc_int const rkpk_rem, + + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, + __global skc_block_id_t const * const bp_ids) +{ + // + // Append an sk key to the in-register xk_v vector + // + // If the work-in-progress node in gmem will only have room for one + // more key then: + // + // - if this was the final SK then write out xk_v and exit + // + // - otherwise, acquire a block id, link it, write out xk_v, + // prepare new node + // + // Note that this does *not* try to squeeze in a final key into the + // next node slot. This optimization isn't worth the added + // down-pipeline complexity. + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) + { + *xk_v = *sk_s; + } + + *xk_v_next += 1; + + // are there more keys coming? + if (rkpk_rem > 0) + { + // is the node almost full? + if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) + { + skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + + if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) + { + xk_v->lo = id; + xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary + } + + // store xk_v (uint2) to bp (uint) + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // update node elem idx + *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // reset node count + *xk_v_next = 0; + } + // is xk_v full? 
+ else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) + { + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // increment node elem idx + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + } + } + else + { + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) + { + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + + bp_elems[*xk_v_idx] = SKC_UINT_MAX; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; + } + } + +#else + // + // SIMD + // + +#endif +} + +// +// +// + +static +skc_ttpk_s_t +skc_ttpk_create(skc_raster_yx_s const yx_prev, + skc_raster_yx_s const yx_next, + skc_block_id_t const pb_id) +{ + // - yx_prev is already incremented by one + // - yx_span is already shifted up at hi.x + skc_uint const yx_span = yx_next - yx_prev; + + skc_ttpk_s_t pk; + + // turn on prefix bit | shift span bits upward + pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN); + + // shift down high span bits | yx of tile + pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("* %08v2X : %u\n",pk,yx_span); +#endif + + return pk; +} + +// +// append a ttpk key to the work-in-progress node +// + +static +void +skc_node_v_append_pk(skc_ttpk_s_t const * const pk_s, + + skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + skc_uint * const xk_v_idx, + __global skc_bp_elem_t * const bp_elems, + + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, + __global skc_block_id_t const * const bp_ids) +{ + // + // append a pk key to the in-register xk_v vector + // + // if the work-in-progress node in gmem will only have room for one + // more key then: + // + // - if this was the final SK then write out xk_v and exit + // + // - otherwise, acquire a block id, link it, write out xk_v, + // prepare new node + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) + { + *xk_v = *pk_s; + } + + *xk_v_next += 1; + + // is the node almost full? + if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) + { + skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + + if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) + { + xk_v->lo = id; + xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary + } + + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // update node elem idx + *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // reset node count + *xk_v_next = 0; + } + // is xk_v full? 
+ else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) + { + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // increment node elem idx + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + } + +#else + // + // SIMD + // +#endif +} + +// +// append the first 3 fields of meta info to the raster header +// + +static +void +skc_node_v_init_header(skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + union skc_raster_cohort_meta_out const * const meta) +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() < 2) + { + *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi; + } + +#if 0 + if (get_sub_group_local_id() == 0) + printf("header: %08v4X\n",meta->u32v4); +#endif + + // + // increment counter: uint4 + uint4 = uint2 x 4 + // + *xk_v_next = 2 + 2; // +2 for unitialized bounds + +#else + // + // SIMD + // + +#endif +} + +// +// +// + +__kernel +SKC_PREFIX_KERNEL_ATTRIBS +void +skc_kernel_prefix(__global skc_uint const * const bp_atomics, + __global skc_block_id_t const * const bp_ids, + __global skc_bp_elem_t * const bp_elems, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_ttrk_e_t const * const rks, + __global skc_block_id_t * const map, + __global skc_uint const * const metas, + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem smem[1]; +#else + __local struct skc_subgroup_smem smems[SKC_PREFIX_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id(); +#endif + + // + // where is this subgroup in the grid? + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const sgi = get_group_id(0); +#else + skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + skc_uint const sgl = get_sub_group_local_id(); + + // + // return if this subgroup is excess + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 ) + if (sgi >= count) + return; +#endif + + // + // get meta info for this subgroup's raster + // + union skc_raster_cohort_meta_out const meta = { vload4(sgi,metas) }; + skc_uint const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("%3u : %5u / %5u / %5u / %5u / %u\n", + sgi, + meta.blocks, + meta.offset, + meta.nodes, + meta.keys, + reads); +#endif + + // + // preload blocks -- align on subgroup + // + skc_uint blocks_idx = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); + skc_block_id_v_t blocks = bp_ids[blocks_idx & bp_mask]; + skc_uint blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK); + + // + // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset + // + skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // + // initialize raster header -- assumes block is greater than 8 words (4 doublewords) + // + skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX }; + skc_uint xk_v_next; + + skc_node_v_init_header(&xk_v,&xk_v_next,&meta); + + // + // no keys -- this is an empty raster! 
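//
// A minimal serial C sketch of the "preload blocks -- align on
// subgroup" step above: the starting cursor is rounded down to a
// subgroup boundary, a full vector of ids is loaded, and consumption
// begins at the cursor's remainder within that vector.  Sizes below
// are assumptions of the sketch.
//

#include <stdint.h>

#define SG_SIZE_SKETCH   8u
#define SG_MASK_SKETCH   (SG_SIZE_SKETCH - 1u)
#define RING_MASK_SKETCH 1023u   /* pow2 ring of 1024 ids (assumed) */

/* align down, load a full vector, start consuming at the remainder */
static void
preload_sketch(uint32_t const * const ring,
               uint32_t         const reads,
               uint32_t       * const vec,   /* SG_SIZE_SKETCH entries */
               uint32_t       * const next)
{
  uint32_t const base = reads & ~SG_MASK_SKETCH;            /* align down to subgroup */

  for (uint32_t lane = 0; lane < SG_SIZE_SKETCH; lane++)    /* coalesced on the device */
    vec[lane] = ring[(base + lane) & RING_MASK_SKETCH];

  *next = reads & SG_MASK_SKETCH;                           /* first unconsumed entry  */
}

//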
+ // + if (meta.keys == 0) + { + bp_elems[xk_v_idx ] = xk_v.lo; + bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi; + + while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) + { + xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + + bp_elems[xk_v_idx] = SKC_UINT_MAX; + bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; + } + + return; + } + + // + // load TTRK keys and in-place convert to TTSK keys + // + skc_uint rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); + skc_ttsk_v_t sk_v = rks[rks_next]; + skc_uint sk_next = (meta.offset & SKC_PREFIX_SUBGROUP_MASK); + skc_int rkpk_rem = meta.keys; // signed count of remaining rk+pk keys + +#if 0 + printf("* %08X ( %3u, %3u )\n", + sk_v.hi, + (sk_v.hi >> 12) & 0xFFF, + (sk_v.hi ) & 0xFFF); +#endif + + skc_ttrk_to_ttsk(&sk_v); + +#if 0 + printf("! %08X ( %3u, %3u )\n", + sk_v.hi, + (sk_v.hi >> 20) & 0xFFF, + (sk_v.hi >> 8) & 0xFFF); +#endif + + // + // subblocks + // +#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) + skc_block_id_t subblocks = 0; +#endif + + // + // begin "scan" of tiles + // + skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next); + + // + // zero the accumulator + // + skc_accum_reset(smem); + + while (true) + { + // get next rk key + skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem); + + // load ttsb id + skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s); + + // load tts_v transaction "in flight" as early as possible + skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id); + +#if 0 + printf("{ %08X }\n",tts_v); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("[ %d, %X ]\n",rkpk_rem,sb_id); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF); +#endif + + // + // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF + // TIME AND SIMD'IZED + // + + // if yx's don't match then we're either issuing a ttpk or + // resetting the accumulator + if (sk_s.hi != yx_prev) + { + // if yx_next.y == yx_last.y then x changed + if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0) + { + // + // if the tile is not square then it's ratio is 1:2 + // +#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 + skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks, + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); +#else + skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); +#endif + + // flush accumulated ttp vector to block/subblock at ttpb_id + skc_accum_flush(smem,bp_elems,pb_id); + +#if 0 + if (get_sub_group_local_id() == 0) + { + printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n", + pb_id, + (yx_prev >> SKC_TTXK_HI_OFFSET_Y), + (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF, + (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF, + (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF); + } +#endif + + // + // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP + // + rkpk_rem -= 1; + + // create the pk + skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id); + + // append pk key to xk buffer + skc_node_v_append_pk(&pk_s, + + &xk_v, + &xk_v_next, + &xk_v_idx, + bp_elems, + + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); + } + else if (rkpk_rem > 0) // we're starting a new tile row + { + skc_accum_reset(smem); + } + } + + // + // append sk key to node_v + // + // if rkpk_rem is 
zero then return from kernel + // + skc_node_v_append_sk(&sk_s, + + &xk_v, + &xk_v_next, + &xk_v_idx, + bp_elems, + + rkpk_rem, + + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); + + // we're done if no more sk keys + if (rkpk_rem == 0) + break; + + // move to new tile + yx_prev = sk_s.hi; + + // scatter tts values into accumulator + skc_accum_scatter(smem,tts_v); + + // replenish sk keys + skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl new file mode 100644 index 0000000000..e622845d9c --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl @@ -0,0 +1,3366 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "raster_builder_cl_12.h" +#include "device_cl_12.h" + +// #define SKC_ARCH_AVX2 +// #define SKC_RASTERIZE_SIMD_USES_SMEM + +#define PRINTF_ENABLE 0 +#define PRINTF_BLOCK_COUNT 0 + +// +// NOTE: +// +// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT +// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE +// +// NOTE: +// +// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP. +// +// + +#if 0 // SKC_ARCH_AVX2 + +// #define SKC_RASTERIZE_SUBGROUP_SIZE 1 +// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3 +// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1 + +// #define SKC_TTXB_WORDS 8 + +// #define SKC_RASTERIZE_FLOAT float8 +// #define SKC_RASTERIZE_UINT uint8 +// #define SKC_RASTERIZE_INT int8 +// #define SKC_RASTERIZE_PREDICATE int8 + +// #define SKC_RASTERIZE_BIN_BLOCK uint16 +// #define SKC_RASTERIZE_BIN uint8 + +// #define SKC_RASTERIZE_POOL uint8 +// #define SKC_RASTERIZE_POOL_SCALE 6 + +// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1 +// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 + +// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8() + +#endif + +// +// SIMT +// + +#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE +#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE +#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1) + +// +// +// + +#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2) +#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE) + +// +// +// + +#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 } +#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 } + +// +// +// + +#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS) +#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS) +#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS) +#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS) +#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT) +#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS) + +// +// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" +// +// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ +// +// Lerp in two fma/mad ops: +// +// t * b + ((-t) * a + a) +// +// Note: OpenCL documents mix() as being implemented as: +// +// a + (b - a) * t +// +// But this may be a native 
instruction on some devices. For example, +// on GEN9 there is an LRP "linear interoplation" opcode but it +// doesn't appear to support half floats. +// +// Feel free to toggle this option and then benchmark and inspect the +// generated code. We really want the double FMA to be generated when +// there isn't support for a LERP/MIX operation. +// + +#if 1 +#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) +#else +#define SKC_LERP(a,b,t) mix(a,b,t) +#endif + +// +// There is no integer MAD in OpenCL with "don't care" overflow +// semantics. +// +// FIXME -- verify if the platform needs explicit MAD operations even +// if a "--fastmath" option is available at compile time. It might +// make sense to explicitly use MAD calls if the platform requires it. +// + +#if 1 +#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c)) +#else +#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c) +#endif + +// +// +// + +#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()) + +// +// +// + +union skc_bp_elem +{ + skc_uint u32; + skc_tagged_block_id_t tag_id; + skc_float coord; +}; + +// +// +// + +struct skc_subgroup_smem +{ + // + // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM ) + struct { + union { + + skc_uint winner; + + struct { + skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; + } aN; + + struct { + SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; + } vN; + }; + } subgroup; +#endif + + // + // work-in-progress TTSB blocks and associated YX keys + // + union { + struct { + // FIXME -- some typedefs are valid here + skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; + skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + } aN; +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + struct { + SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + SKC_RASTERIZE_BIN yx; + SKC_RASTERIZE_BIN id; + SKC_RASTERIZE_BIN count; + } vN; +#endif + } bin; +}; + +// +// +// + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) +#define skc_subgroup_lane() 0 +#else +#define skc_subgroup_lane() get_sub_group_local_id() +#endif + +// +// replenish block ids +// +// note that you can't overrun the block id pool since it's a ring +// + +static +void +skc_blocks_replenish(skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // + // get a new vector of block ids -- this is kind of a narrow + // allocation but subblocks help stretch out the pool. 
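//
// A minimal host-side C check of the SKC_LERP identity above:
// t * b + ((-t) * a + a) == a + (b - a) * t, mapped onto two fused
// multiply-adds.  The two forms can differ in the last bits; the
// printout below simply compares them side by side.
//

#include <math.h>
#include <stdio.h>

/* two fused multiply-adds, as in SKC_LERP */
static float
lerp_fma(float const a, float const b, float const t)
{
  return fmaf(t, b, fmaf(-t, a, a));
}

/* the mix()-style formulation */
static float
lerp_mix(float const a, float const b, float const t)
{
  return a + (b - a) * t;
}

int
main(void)
{
  for (float t = 0.0f; t <= 1.0f; t += 0.125f)
    printf("%5.3f : %g %g\n", t, lerp_fma(2.0f, 5.0f, t), lerp_mix(2.0f, 5.0f, t));

  return 0;
}

//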
+ // + // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids + // + skc_uint bp_idx = 0; + + if (skc_subgroup_lane() == 0) + { + bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS, + SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads +#if 0 + printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE); +#endif + } + + bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask; + *blocks = bp_ids[bp_idx]; + *blocks_next = 0; +} + +// +// +// + +static +skc_block_id_t +skc_blocks_get_next(skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // replenish? + if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE) + { + skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); + } + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); + +#else + // + // SIMD + // + skc_block_id_t id = blocks->s0; + + skc_shuffle_down_1(*blocks); + +#endif + + *blocks_next += 1; + + return id; +} + +// +// subblock allocator +// + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + +static +skc_block_id_t +skc_subblocks_get_next(skc_block_id_t * const subblocks, + skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); + } + + skc_block_id_t const sb_id = *subblocks; + + *subblocks += 1; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("= %u\n",sb_id); +#endif + + return sb_id; +} + + +#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks +#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks + +#else + +#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks +#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks + +#endif + +// +// +// + +static +skc_block_id_t +skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(), + skc_uint * const blocks_next, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + skc_uint const new_yx) +{ +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t const new_id = skc_subblocks_get_next(subblocks, + blocks_next, + blocks, + bp_atomics, + bp_mask, + bp_ids); +#else + skc_block_id_t const new_id = skc_blocks_get_next(blocks_next, + blocks, + bp_atomics, + bp_mask, // pow2 modulo mask for block pool ring + bp_ids); +#endif + + if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK)) + { + sk_v->lo = new_id; + sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx; +#if 0 + printf("@ ( %3u, %3u ) %u\n", + (new_yx >> 12) & 0xFFF, + (new_yx ) & 0xFFF, + new_id); +#endif + } + + *sk_v_next += 1; + + if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE) + { + *sk_v_next = 0; + + skc_uint sk_idx = 0; + + if (skc_subgroup_lane() == 0) + { + sk_idx = 
SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE); +#if 0 + printf("+ %u\n",sk_idx); +#endif + } + + sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE ) + if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE) +#endif + { + sk_extent[sk_idx] = *sk_v; +#if 0 + printf("> %u : %v2u\n",sk_idx,*sk_v); +#endif + } + } + + return new_id; +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. + // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 0 + + // -- + // 01 + SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 012 + + // ---- + // 0123 + // 01 + + // ---- + // 0123 + // + SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 0123456 + + // -------- + // 01234567 + // 012345 + + // -------- + // 01234567 + // 0123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w); + SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 0123456789abcde + + // ---------------- + // 0123456789abcdef + // 0123456789abcd + + // ---------------- + // 0123456789abcdef + // 0123456789ab + + // ---------------- + // 0123456789abcdef + // 01234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); + SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); + SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_add(v); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. 
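+  //
+  // (A scalar fallback -- sketched here for reference only, this
+  // kernel does not use it -- would simply walk the vector
+  // components:
+  //
+  //   SKC_RASTERIZE_UINT w = v;
+  //
+  //   for (uint ii=1; ii<SKC_RASTERIZE_VECTOR_SIZE; ii++)
+  //     ((uint *)&w)[ii] += ((uint *)&w)[ii-1];
+  //
+  //   return w;
+  //
+  // see the FIXME below.)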
+ // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 0 + + // -- + // 01 + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 012 + + // ---- + // 0123 + // 01 + + // ---- + // 0123 + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 0123456 + + // -------- + // 01234567 + // 012345 + + // -------- + // 01234567 + // 0123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w); + SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 0123456789abcde + + // ---------------- + // 0123456789abcdef + // 0123456789abcd + + // ---------------- + // 0123456789abcdef + // 0123456789ab + + // ---------------- + // 0123456789abcdef + // 01234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); + SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); + SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_add(v); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. 
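+  //
+  // Note that this max-scan is what skc_scatter_scan_max() further
+  // below builds on: it stores each "source" lane index into a
+  // scratch slot and relies on the inclusive max-scan to spread that
+  // index rightward across the lanes that will work on that source's
+  // segments.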
+ // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 00 max + // -- + // 01 + SKC_RASTERIZE_UINT const w = max(v.s00,v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 0012 + + // ---- + // 0123 + // 0101 + + // ---- + // 0123 + // + SKC_RASTERIZE_UINT const w = max(v.s0012,v); + SKC_RASTERIZE_UINT const x = max(w.s0101,w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 00123456 + + // -------- + // 01234567 + // 01012345 + + // -------- + // 01234567 + // 01230123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_UINT const w = max(v.s00123456,v); + SKC_RASTERIZE_UINT const x = max(w.s01012345,w); + SKC_RASTERIZE_UINT const y = max(x.s01230123,x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 00123456789abcde + + // ---------------- + // 0123456789abcdef + // 010123456789abcd + + // ---------------- + // 0123456789abcdef + // 01230123456789ab + + // ---------------- + // 0123456789abcdef + // 0123456701234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v); + SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w); + SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x); + SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_max(v); + +#endif +} + +// +// +// + +static +float +skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return v.s1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return v.s3; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return v.s7; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return v.sf; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return v.s1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return v.s3; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return v.s7; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return v.sf; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); + +#endif +} + +// +// +// + +static +float +skc_subgroup_first(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#else + return v.s0; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,0); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v, + SKC_RASTERIZE_UINT const i) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#else + return shuffle(v,i); +#endif + +#else + // + // SIMT + // + return intel_sub_group_shuffle(v,i); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // 
previous + SKC_RASTERIZE_FLOAT const c) // current +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // FIXME -- there are alternative formulations here: + // + // Option 1: + // + // select(c.rotate(+1),p.rotate(-1),(1,0,0,...)) + // + // Option 2: + // + // p is a scalar + // t = c.rotate(+1) + // t.s0 = p; + // + // Option 3: ... + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return p; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return shuffle2(p,c,(uint2)(1,2)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return shuffle2(p,c,(uint4)(3,4,5,6)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)); +#endif + +#else + // + // SIMT + // + return intel_sub_group_shuffle_up(p,c,1); + +#endif +} + +// +// +// + +static +bool +skc_is_lane_first() +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) + // + // SIMD + // + return true; +#else + // + // SIMT + // + return get_sub_group_local_id() == 0; +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_delta_offset() +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return 1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ); +#endif + +#else + // + // SIMT + // + return 1.0f + get_sub_group_local_id(); + +#endif + +} + +// +// +// + +static +int +skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + return any(p); +#else + // + // SIMT + // + return sub_group_any(p); +#endif +} + +// +// +// + +#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK) + +void +skc_segment_next(__global union skc_bp_elem * const bp_elems, + skc_uint * const nodeword, + skc_block_id_t * const id) +{ + if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword)) + { + *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS; + } + + skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id; + + *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + } +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y) +{ + return native_sqrt(x * x + y * y); +} + +// +// Wang's Formula (1985) +// + +#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned + +#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32) + +#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON)) +#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON)) + +#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y) +#define SKC_WANG_SQRT(x) native_sqrt(x) + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, + SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, + SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y, + SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y) +{ + // + // Return 
the number of evenly spaced (in the parametric sense) line + // segments that are guaranteed to be within "epsilon" error of the + // curve. + // + // We're then going to take multiples of the reciprocal of this + // number so that the segmentation can be distributed across the + // subgroup. + // + // Note, this can probably be slightly optimized per architecture + // but it's probably far from being a hotspot since it's all + // straight-line unpredicated code. + // + // The result is an integer ranging from [1.0,#segments] + // + // Note that even if all of the control points are coincident, the + // max(1.0f) will categorize this as a line of 1 segment. + // + // This is what we want! We want to convert cubics to lines as + // easily as possible and *then* cull lines that are either + // horizontal or zero length. + // + return max(1.0f, + ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC * + SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x), + fabs(t3x - 2.0f * t2x + t1x)), + max(fabs(t2y - 2.0f * t1y + t0y), + fabs(t3y - 2.0f * t2y + t1y)))))); +} + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, + SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, + SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y) +{ + return max(1.0f, + ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * + SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x), + fabs(t2y - 2.0f * t1y + t0y))))); +} + +// +// rational curves +// + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_cubic_rat() +{ + return 0.0f; +} + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_quad_rat() +{ + return 0.0f; +} + +// +// flush any work-in-progress blocks and return unused block ids +// + +static +void +skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_v_t * const blocks, + skc_uint const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem) +{ + // + // flush non-empty bins + // + // FIXME -- accelerate this iteration/search with a subgroup operation + // + for (skc_uint ii=0; iibin.aN.count[ii] > 0) + { + skc_block_id_v_t const id = smem->bin.aN.id[ii]; + skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + skc_uint const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()]; +#if 0 + printf("???????? 
: [ %10u = %10u : %08X ]\n",id,idx,tts); +#endif + bp_elems[idx].u32 = tts; + } + + // + // FIXME -- vectorize with vstoreN() + // + } + + // + // return remaining block ids back to the pool + // + skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next; + + if (blocks_rem > 0) + { + skc_uint bp_idx = 0; + + if (skc_subgroup_lane() == 0) + { + bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem); + +#if 0 + printf("r-: %8u + %u\n",bp_idx,blocks_rem); +#endif + } + + bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask; + + if (skc_subgroup_lane() >= blocks_next) + { + bp_ids[bp_idx] = *blocks; + } + } + + // + // flush work-in-progress ryx keys + // + if (sk_v_next > 0) + { + skc_uint sk_idx = 0; + + if (skc_subgroup_lane() == 0) + { + sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next); +#if 0 + printf("* %u\n",sk_idx); +#endif + } + + sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); + + if (skc_subgroup_lane() < sk_v_next) + { + sk_extent[sk_idx] = *sk_v; + } + } +} + +// +// If there are lanes that were unable to append to a bin because +// their hashes collided with a bin's current ryx key then those bins +// must be ejected. +// +// Note that we do not eject "full" bins because lazily waiting for a +// collision results in simpler code. +// + +static +void +skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_t * const subblocks, + skc_block_id_v_t * const blocks, + skc_uint * const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_UINT const hash, + SKC_RASTERIZE_UINT const yx, + SKC_RASTERIZE_PREDICATE is_collision) // pass by value +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + + // + // FIXME -- this code is now stale with the changes to the + // subblock/block allocation strategy + // + + // + // get local TTSB ID queue count + // + skc_uint ttsb_id_count = smem->pool.count; // scalar + + // init hash bit mask + skc_uint component_mask = 0; + + for (int cc=0; ccbin.aN.count[winner] > 0) + { + skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + + bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; + } + + // + // ensure there is at least one TTSK and TTSB ID + // + if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE) + { + // + // update remaining count + // + ttsb_id_count = 0; + + // + // flush accumulated ttsk_ryx keys + // + uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count + +#if 0 + printf("# %u\n",idx); +#endif + + for (uint ii=0; iipool.aN.id[ii] = bp_ids[id + ii]; + } + + // + // invalidate the winning block + // + + // + // update bin with winning yx, new ttsb id and zero count + // + // all lanes are loading/storing from/to the same index + // + smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID ); + smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count]; + smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc]; + smem->bin.aN.count[winner] = 0; + + // + // update count + // 
+ ttsb_id_count += 1; + } + + // + // save count + // + smem->pool.count = ttsb_id_count; + +#else + // + // SIMT + // + + do { + // + // only one lane will win! + // + if (is_collision) + smem->subgroup.winner = hash; + + barrier(CLK_LOCAL_MEM_FENCE); + + // + // which bin is being ejected? + // + skc_uint const winner = smem->subgroup.winner; + + // + // which colliding hash is taking over the bin? + // + SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner); + + // + // all lanes with the same hash will try to store but only one + // lane will win + // + if (is_winner) + smem->subgroup.winner = yx; + + barrier(CLK_LOCAL_MEM_FENCE); + + // + // flush this block to the pool + // + if (smem->bin.aN.count[winner] > 0) + { + skc_block_id_v_t const id = smem->bin.aN.id[winner]; + skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; +#if 0 + printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts); +#endif + bp_elems[idx].u32 = tts; + } + + // + // append new ttsk + // + skc_uint const new_yx = smem->subgroup.winner; + skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(), + blocks_next, + bp_atomics, + bp_mask, // pow2 modulo mask for block pool ring + bp_ids, + cohort_atomics, + sk_v, + sk_v_next, + sk_extent, + new_yx); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf(">>> %9u\n",new_id); + } +#endif + + // + // update bin with winning yx, new ttsb id and zero count + // + smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID; + smem->bin.aN.yx [winner] = new_yx; + smem->bin.aN.id [winner] = new_id; + smem->bin.aN.count[winner] = 0; + + // + // remove all lanes matching this hash + // + is_collision = is_collision && !is_winner; + + // + // exit if nothing left to do + // + } while (sub_group_any(is_collision)); + +#endif +} + +// +// scatter scan max +// +static +SKC_RASTERIZE_UINT +skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_FLOAT const iss, + SKC_RASTERIZE_FLOAT const ess) +{ + // + // prefix sums determine which lanes we're going to work on next + // + SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP); + SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f)); + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#ifdef SKC_RASTERIZE_SIMD_USES_SMEM + // + // SIMD APPROACH 1: SIMT'ISH + // + + // zero the volatile smem scratchpad using vector syntax + smem->subgroup.vN.scratch[0] = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_scratch_store C) \ + smem->subgroup.aN.scratch[scratch_idx C] = I; + + SKC_RASTERIZE_VECTOR_EXPAND(); + + // propagate lanes to right using max scan + SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0]; + SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); + +#else + // + // SIMD APPROACH 2: SCALAR'ISH + // + + SKC_RASTERIZE_UINT source = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_scratch_store C) \ + ((uint *)&source)[scratch_idx C] = I; + + SKC_RASTERIZE_VECTOR_EXPAND(); + + for (uint ii=1; iisubgroup.vN.scratch[skc_subgroup_lane()] = ( 0 ); + + // + // store source lane at starting lane + // + if (is_scratch_store) + smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane(); + + // + // propagate lanes to right using max scan + // + SKC_RASTERIZE_UINT const 
scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()]; + SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); +#endif + + return source; +} + +// +// sliver lines into subpixels +// + +static +void +skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_t * const subblocks, + skc_block_id_v_t * const blocks, + skc_uint * const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_FLOAT const l0x, + SKC_RASTERIZE_FLOAT const l0y, + SKC_RASTERIZE_FLOAT const l1x, + SKC_RASTERIZE_FLOAT const l1y) +{ + // + // Y-SLIVERING + // ----------- + // + // immediately sliver all multi-pixel lines in into 1-pixel high + // lines + // + // note this implicitly squelches horizontal lines + // + // there is another test for horizontal lines after x-slivering + // is complete + // + + // + // will we need to flip the sign of y_delta ? + // + SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y); + SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000; + + // + // save 1/dy + // + SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y); + + // + // how many non-horizontal subpixel y-axis slivers are there? + // + SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max; + SKC_RASTERIZE_FLOAT y_segs = y_max - y_min; + + // + // inclusive subgroup scan of y_segs + // + SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs); + SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs; + float y_rem = skc_subgroup_last_float(y_iss); + + // + // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails + // + if (y_segs == 0.0f) + y_iss = 0.0f; + +#if 0 + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem); +#endif + + // + // these values don't matter on first iteration + // + SKC_RASTERIZE_FLOAT n1x_prev = 0; + SKC_RASTERIZE_FLOAT n1y_prev = 0; + + // + // loop until done + // + while (y_rem > 0.0f) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess); + + // + // get line at y_source line + // + SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source); + SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source); + SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source); + SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source); + + // + // every lane will create a 1 pixel tall line "sliver" + // + // FIXME -- this gets expanded on SIMD + // + // if numerator == 1 then this is the first lane + // if numerator == s then this is the last lane + // + SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source); + SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source); + + SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count); + + // toggle y_delta sign + SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source))); 
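+
+      //
+      // the XOR above is a branchless sign flip: dy_xor is 0x80000000
+      // only when the source line runs from larger to smaller y, so
+      // y_offset is effectively (y_lt ? +y_delta : -y_delta) for that
+      // source lane
+      //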
+ + // + // calculate "right" line segment endpoint + // + SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP; + SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source); + SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t)); + + // + // override c1 if this is last point + // + n1y = select(n1y,m1y,is_y_last); + n1x = select(n1x,m1x,is_y_last); + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y); + SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x); + + // + // override shuffle up if this is the first line segment + // + n0y = select(n0y,m0y,is_y_first); + n0x = select(n0x,m0x,is_y_first); + + // + // save previous right endpoint + // + n1x_prev = n1x; + n1y_prev = n1y; + + // + // decrement by subgroup size + // + y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + +#if 0 + // + // debug + // + if (n0y != n1y) { + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y); + } +#endif + + // + // X-SLIVERING + // ----------- + // + // now sliver 1-pixel high lines into at either vertical or + // 1-pixel wide lines + // + // save original direction and work with increasing x + // + SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x); + SKC_RASTERIZE_UINT const dx_xor = x_lt ? 0 : 0x80000000; + + // + // save 1/dy + // + SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x); + + // + // how many non-horizontal subpixel y-axis slivers are there? + // + SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const x_base = x_lt ? 
x_min : x_max; + SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f); + + // + // inclusive subgroup scan of y_segs + // + SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs); + SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs; + float x_rem = skc_subgroup_last_float(x_iss); + + // + // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails + // + //if (x_segs == 0.0f) + // x_iss = 0.0f; + + // + // these values don't matter on first iteration + // + SKC_RASTERIZE_FLOAT p1x_prev = 0; + SKC_RASTERIZE_FLOAT p1y_prev = 0; + + // + // loop until done + // + while (x_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess); + + // + // get line at y_source line + // + SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source); + SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source); + SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source); + SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source); + + // + // every lane will create a 1 pixel tall line "sliver" + // + // FIXME -- this gets expanded on SIMD + // + // if numerator == 1 then this is the first lane + // if numerator == s then this is the last lane + // + SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source); + SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source); + + SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count); + + // toggle x_delta sign + SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source))); + + // + // calculate "right" line segment endpoint + // + SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP; + SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source); + SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t)); + + // + // override c1 if this is last point + // + p1x = select(p1x,o1x,is_x_last); + p1y = select(p1y,o1y,is_x_last); + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x); + SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y); + + // + // override shuffle up if this is the first line segment + // + p0x = select(p0x,o0x,is_x_first); + p0y = select(p0y,o0y,is_x_first); + + // + // save previous right endpoint + // + p1x_prev = p1x; + p1y_prev = p1y; + + // + // decrement by subgroup size + // + x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // only non-horizontal subpixel lines are valid + // + SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y); + + // + // if no lanes are active then continue + // + // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY + // IMPACTS PERFORMANCE (+12% ?) + // + // IT SHOULDN'T !!! + // +#if 0 + if (!skc_subgroup_any(is_active)) + continue; +#endif + + // + // Option 1: use SLM for explicitly managed coalesced stores + // + // 1. which tile does this line belong? + // 2. hash tile coordinates + // 3. lookup hash + // 4. if tile matches then SLM append keys + // 5. if tile doesn't match + // a. flush + // b. create new TTSK_RYX + // c. 
obtain TTSB block from pool + // d. goto 3. + // + + // + // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores + // + // 1. which tile does this line belong? + // 2. hash tile coordinates + // 3. lookup hash + // 4. if tile matches then GMEM append keys + // 5. if tile doesn't match + // a. flush (and invalidate empty elems) + // b. create new TTSK_RYX + // c. obtain TTSB block from pool + // d. goto 3. + // + + // + // The virtual rasterization surface is very large and + // signed: +/- ~64K-256K, depending on the architecture. + // + // Rasters must be clipped to the virtual surface and, + // optionally, clipped even further on a per raster + // basis. + // + + // + // Clip to the per-raster clip + // + + /* + + CLIP HERE + + */ + + // + // Hash the tile coordinates + // + // This table lists nominal values for each architecture. + // We want to choose values that are naturally fit the + // "width" of the architecture. + // + // SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS + // ---- ------- ---- --------- -------- --------- + // 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ? + // 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX* + // 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,? + // 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN* + // 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon + // + // NOTE: When possible, bias the hash toward using more y + // bits because of: + // + // 1. the 90 degree counter-clockwise rotation that we put + // in place to offset the render-time clockwise + // rotation + // + // 2. the likely presence of left-to-right or + // right-to-left glyphs. + // + // For power-of-two bins, the hash is easy. + // + // For non-power-of-two, we may want to either implement a + // fast mod (compiler should do this for us... hahahaha) or + // drop down to the next power-of-two. 
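+          //
+          // With power-of-two bins the hash computed further below is
+          // just masking and shifting.  For example, the 3-bit split
+          // sketched in the disabled AVX2 snippet at the top of this
+          // file (Y_BITS = 2, X_BITS = 1) reduces to:
+          //
+          //   hash = ((tile_y & 3) << 1) | (tile_x & 1)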
+ // + + // + // FIXME -- this snarl is not good -- can probably reduce + // some of the sign casting but some is there to vectorize a + // scalar + // + SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y); + SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y); + + SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x); + SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x); + + SKC_RASTERIZE_INT const min_y = min(z0y,z1y); + SKC_RASTERIZE_INT const max_y = max(z0y,z1y); + + SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2; + + SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y; + SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y); + + // + // map [+1,+32] to [ 0,+31] + // map [-1,-32] to [-1,-32] + // + SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26; + + SKC_RASTERIZE_INT const min_x = min(z0x,z1x); + SKC_RASTERIZE_INT const max_x = max(z0x,z1x); + SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2; + + SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X; + SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x); + + SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx; + + SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) | + (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK)); + + SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF)); + +#if 0 + printf("(%3u, %3u)\n",tile_y,tile_x); +#endif + +#if 0 + if (is_active) + printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx); +#endif + + // + // debug + // +#if 0 // PRINTF_ENABLE + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_active C) \ + printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C); + + SKC_RASTERIZE_VECTOR_EXPAND(); +#else + if (is_active) + printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash); +#endif + +#endif + // + // flush all active lanes + // + while (true) + { + // + // either gather load or vector load+shuffle the yx keys + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx; + SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash); +#else + SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash]; +#endif + + // + // does yx for lane match yx for hash? + // + SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID; + SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx); + + // + // OpenCL spec: "When casting a bool to a vector integer + // data type, the vector components will be set to -1 + // (i.e. all bits set) if the vector bool value is true + // and 0 otherwise. + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match; +#else + SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0} +#endif + // + // how many new elements for each matching hash bin? 
+ // + SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS; + SKC_RASTERIZE_UINT const h = h_match << h_shl; + + // + // prefix sum all of the bins in parallel + // + SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h); + SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss); + + // + // current bin counts + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count; + SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash); +#else + SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash]; +#endif + + // + // calculate where each cache-hit and in-bounds tts should be stored + // + SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1; + SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur; + + // + // which lanes can append to a matching bin? + // + SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS); + + // + // scatter append tts elements to bin blocks + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) + // + // SIMD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_append C) \ + { \ + smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \ + smem->bin.aN.count[hash C] = count_new C; \ + } + + SKC_RASTERIZE_VECTOR_EXPAND(); +#else + // + // SIMT + // + if (is_append) + { + smem->bin.aN.ttsb [hash][ttsb_index] = tts; + smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS + } +#endif + // + // try to keep predicate updates SIMD-friendly and + // outside of predicated code paths -- this is not + // always how we would normally do things on SIMT but + // either approach is acceptable + // + + // + // mask off lanes/components that successfully appended + // + is_active = is_active && !is_append; + + // + // are there any active lanes left? + // + if (!skc_subgroup_any(is_active)) + break; + + // + // There are active lanes that couldn't be appended to a + // bin because their hashes collided with the bin's + // current ryx key then those bins must be ejected. + // + // Note that we do not eject "full" bins because lazily + // waiting for a collision results in simpler code. + // + skc_flush(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + subblocks, + blocks, + blocks_next, + sk_v, + sk_v_next, + sk_extent, + smem, + hash, + yx, + is_active); + } + } + } +} + +// +// INITIALIZE SMEM +// +// Note that SIMD/SIMT have nearly the same syntax. 
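+//
+// Note also why the yx seed is SKC_RASTERIZE_YX_INIT (0x7FFF7FFF)
+// rather than zero: packed tile keys occupy at most 24 bits and
+// inactive lanes present SKC_RASTERIZE_YX_INVALID (0x80008000), so
+// neither can ever match a freshly initialized bin -- the first
+// append to any bin is therefore forced through the flush path,
+// which assigns the bin a real yx key and a TTSB block id.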
+// +static +void +skc_smem_init(__local struct skc_subgroup_smem volatile * const smem) +{ + // + // initialize smem bins + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT ); + smem->bin.vN.count = ( 0 ); +#else + // + // SIMT + // + int idx = skc_subgroup_lane(); + +#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) + if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT) +#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) + for (; idxbin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT ); + smem->bin.aN.count[idx] = ( 0 ); + } +#endif +} + +// +// RASTERIZE CUBIC KERNEL +// + +static +void +skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + // + // FIXME -- transformation is currently affine-only support perspective later + // + // the affine transformation requires 8 FMA + 2 ROUND operations + // + SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx + c2y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx + c3y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy + tv->ty); + + // + // + // +#if PRINTF_ENABLE + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \ + " { %.02f, %.02f }, { %.02f, %.02f } },\n", \ + b0x C,b0y C,t1x C,t1y C, \ + t2x C,t2y C,t3x C,t3y C); + + SKC_RASTERIZE_VECTOR_EXPAND(); + +#else + + printf("{ { %.02f, %.02f }, { 
%.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n", + b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); + +#endif + +#endif + + // + // OLD APPROACH + // ------------ + // + // The Spinel CUDA rasterizer was significantly more complex and + // performed a few different tasks that are probably best kept + // separate. + // + // The Spinel rasterizer Bezier held 4-element x and y coordinates + // in adjacent lanes. This simplified intermingling of single lane + // 4-coordinate line segments with two-lane cubic Beziers. + // + // After transformation of the input segments, the Spinel rasterizer + // would test cubics for flatness and, if flat, collapse the + // adjacent lanes into a single line lane and an empty lane. + // + // Any lines would then be appended to a line queue. + // + // Any cubics would then be subdivided. + // + // The reclassification process would be repeated. + // + // NEW APPROACH + // ------------ + // + // Assume we're only working with cubics in this kernel. + // + // Optimization: if the line segment is a special case -- a cusp, + // has 1+ inflections, or a loop -- it might be beneficial to + // subdivide the control cage 1+ times in order to separate the + // flatter segments the high-velocity region(s). + // + // This means we want to split using [a,b] formulation to _directly_ + // subdivide producing a new control cage. + // + // Wang's Formula is still useful even if we subdivide once or twice + // as it's so cheap that it might give some useful hints about where + // the high-velocity sections of curve reside. + // + // But it seems like using Wang's and directly flattening to line + // segments without any subdivision is good enough for the limited + // set of test cases that I've tried. + // + // So... use Wang's Formula to estimate how many line segment are + // required to properly flatten the cubics. + // + // Then use inclusive/exclusive scans to put all the lanes to work: + // + // 1. segmenting cubics to line segments + // + // 2. slivering line segments into 1-pixel high line segments + // + // 3. slivering 1-pixel high line segments into 1-pixel wide line + // segments + // + // MORE BACKGROUND ON NEW APPROACH + // ------------------------------- + // + // Two options for handling line segments: + // + // 1. append the line segments onto an SLM array until enough + // work has been accrued (Spinel does this) + // + // 2. immediately sliver the potentially multi-pixel line + // segments into subpixel lines + // + // The advantage of (1) is that it guarantees the slivering + // process will, on average, always be emitting a full subgroup + // of subpixel lines. + // + // The advantage of (2) is that it reduces code complexity and + // leaves more room for SLM tile bins. The difference between Spinel + // and Skia Compute is that Wang's Formula guarantees there will be + // a full subgroup of multi-pixel lines unless this is the final + // iteration of the warp of multi-pixel lines. + // + // Note that wider GPU architectures might benefit from (1) and + // other work accumulation strategies because it will minimize + // partial warp workloads in the final iteration of each stage. It + // also minimizes the sunk cost of the uniform control logic steps. + // + // So let's implement (2) for now... + // + + // + // And... begin! + // + // Estimate how many line segments are in quad/cubic curve. + // + // Wang's Formula will return zero if the control points are + // collinear but we bump it up to 1.0f. 
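+  //
+  // In closed form, with d the curve degree and eps = SKC_WANG_EPSILON,
+  // the bound is:
+  //
+  //   segs = ceil( sqrt( (d*(d-1)) / (8*eps) * L ) )
+  //
+  // where L is the length of the componentwise maximum of the
+  // control-point second differences -- SKC_WANG_CUBIC and
+  // SKC_WANG_QUADRATIC above are exactly the d = 3 and d = 2
+  // constants.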
+ // + SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); + + // + // if there are free registers then precalculate the reciprocal for + // each estimated segments since it will never change + // + SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); + + + // + // inclusive add scan of estimated line segments + // exclusive add scan of estimated line segments + // total number of estimated line segments + // + SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); + SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; + float s_rem = skc_subgroup_last_float(s_iss); // scalar + + // + // Precompute cubic polynomial coefficients from transformed control + // cage so we can shuffle them in on each iteration of the outer + // loop and then evaluate the polynomial in Horner form. + // + // | 1 0 0 0 | | c0 | + // | | | | + // | -3 3 0 0 | | c1 | + // B(t) = [ 1 t^1 t^2 t^3 ] | | | | + // | 3 -6 3 0 | | c2 | + // | | | | + // | -1 3 -3 1 | | c3 | + // + // + SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL + SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL + + SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL + SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL + + SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB + SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB + + // + // these values don't matter on the first iteration + // + SKC_RASTERIZE_FLOAT l1x_prev = 0; + SKC_RASTERIZE_FLOAT l1y_prev = 0; + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // loop until done + // + while (s_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); + + // + // every lane has a fraction to work off of + // + // FIXME -- this gets expanded on SIMD + // + // if delta == 1 then this is the first lane + // if count == s_segs then this is the last lane + // + SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); + SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); + + SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); + + // + // init parametric t + // + SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? + + // + // if last then override to a hard 1.0f + // + s_t = is_s_last ? 
1.0f : s_t; + + // + // decrement by subgroup size + // + s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // now every lane knows what to do and the following lines will + // pump out up to SUBGROUP_SIZE line segments + // + // obtain the src vertices through shared or via a shuffle + // + + // + // shuffle in the polynomial coefficients their source lane + // + SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); + SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); + + SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); + SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); + + SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); + SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); + + SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source); + SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source); + + // + // calculate "right" line segment endpoint using Horner form + // + SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND + SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); + SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); + + // + // save previous right endpoint + // + l1x_prev = l1x; + l1y_prev = l1y; + + // + // override shuffle up if this is the first line segment + // + l0x = select(l0x,s0x,is_s_first); + l0y = select(l0y,s0y,is_s_first); + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + } + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// RASTERIZE QUAD KERNEL +// + +static +void +skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + 
skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + // + // FIXME -- transformation is currently affine-only support perspective later + // + // the affine transformation requires 8 FMA + 2 ROUND operations + // + SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx + c2y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy + tv->ty); + + // + // Estimate how many line segments are in quad/cubic curve. + // + // Wang's Formula will return zero if the control points are + // collinear but we bump it up to 1.0f. + // + SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y); + + // + // if there are free registers then precalculate the reciprocal for + // each estimated segments since it will never change + // + SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); + + + // + // inclusive add scan of estimated line segments + // exclusive add scan of estimated line segments + // total number of estimated line segments + // + SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); + SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; + float s_rem = skc_subgroup_last_float(s_iss); // scalar + + // + // Precompute quadratic polynomial coefficients from control cage so + // we can shuffle them in on each iteration of the outer loop and + // then evaluate the polynomial in Horner form. 
+ // + + // | 1 0 0 | | c0 | + // | | | | + // B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 | + // | | | | + // | 1 -2 1 | | c2 | + // + // + SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL + SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL + + SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD + SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD + + // + // these values don't matter on the first iteration + // + SKC_RASTERIZE_FLOAT l1x_prev = 0; + SKC_RASTERIZE_FLOAT l1y_prev = 0; + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // loop until done + // + while (s_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); + + // + // every lane has a fraction to work off of + // + // FIXME -- this gets expanded on SIMD + // + // if delta == 1 then this is the first lane + // if count == s_segs then this is the last lane + // + SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); + SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); + + SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); + + // + // init parametric t + // + SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? + + // + // if last then override to a hard 1.0f + // + s_t = is_s_last ? 
1.0f : s_t; + + // + // decrement by subgroup size + // + s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // now every lane knows what to do and the following lines will + // pump out up to SUBGROUP_SIZE line segments + // + // obtain the src vertices through shared or via a shuffle + // + + // + // shuffle in the polynomial coefficients their source lane + // + SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); + SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); + + SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); + SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); + + SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); + SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); + + // + // calculate "right" line segment endpoint using Horner form + // + SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND + SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); + SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); + + // + // save previous right endpoint + // + l1x_prev = l1x; + l1y_prev = l1y; + + // + // override shuffle up if this is the first line segment + // + l0x = select(l0x,s0x,is_s_first); + l0y = select(l0y,s0y,is_s_first); + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + } + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// RASTERIZE LINE KERNEL +// + +static +void +skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + +#if 0 + // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y); + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y); +#endif + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + 
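+  //
+  // rough example (hypothetical values): with sx = sy = 32 and no
+  // shear or translation, the endpoint (10.25, 3.5) snaps to the
+  // subpixel grid at (round(328.0), round(112.0)) = (328, 112)
+  //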
// + // FIXME -- transformation is currently affine-only + // FIXME -- support perspective later + // + // the affine transformation requires 8 FMA + 4 ROUND operations + // + SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx + c1y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy + tv->ty); + +#if 0 + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y); +#endif + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. 
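+  //
+  // e.g. (hypothetical sizes): with 4 subgroups per workgroup, the
+  // subgroup with get_group_id(0) == 3 and get_sub_group_id() == 2
+  // would process cmd_idx = 3 * 4 + 2 = 14
+  //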
+ // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("+cmd_idx = %u\n",cmd_idx); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("-cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("[ %u ]< %u, %u, %u, %u >\n", + cmd_idx, + cmd.nodeword, + SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd), + SKC_CMD_RASTERIZE_GET_CLIP(cmd), + SKC_CMD_RASTERIZE_GET_COHORT(cmd)); +#endif + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id); + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
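+  //
+  // e.g. a host-side identity transform scaled to a hypothetical
+  // 32x32 subpixel resolution would be loaded as:
+  //
+  //   { 32.0f, 0.0f, 0.0f, 0.0f, 32.0f, 0.0f, 0.0f, 0.0f }
+  //      sx    shx   tx    shy   sy     ty    w0    w1
+  //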
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + switch (tag) + { + case SKC_BLOCK_ID_TAG_PATH_LINE: + skc_rasterize_lines(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_QUAD: + skc_rasterize_quads(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_CUBIC: + skc_rasterize_cubics(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD: + break; + case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: + break; + + default: + break; + } +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. 
+ // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. + // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_lines(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
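+  //
+  // note: every lane in the subgroup indexes the same command, so the
+  // float8/float4 loads below are effectively uniform broadcast loads
+  //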
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_quads(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_cubics(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + ; +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + ; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl new file mode 100644 index 0000000000..0c7da7d0ad --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl @@ -0,0 +1,144 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "raster.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "raster_builder_cl_12.h" +#include "device_cl_12.h" + +// +// There is a fixed-size meta table per raster cohort that we use to +// peform a mostly coalesced sizing and allocation of blocks. +// +// This code is simple and fast. 
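+//
+// A rough sketch of the sizing below (hypothetical sizes): with an
+// 8-dword raster head, 64-dword nodes and 100 keys to store,
+//
+//   hn = (8 + 100 + 64 - 2) / (64 - 1) = 2   // ceil((head + keys) / (node - 1))
+//
+// so a single node trails the head. Blocks are then acquired with one
+// atomic add per group after a group-wide prefix sum.
+//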
+// + +__kernel +SKC_RASTERS_ALLOC_KERNEL_ATTRIBS +void +skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t * const map, + __global skc_uint * const metas, + __global skc_uint const * const raster_ids, // FIXME -- CONSTANT + skc_uint const count) +{ + // access to the meta extent is linear + skc_uint const gid = get_global_id(0); + skc_bool const is_active = gid < count; + + // + // init with defaults for all lanes + // + union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } }; + skc_uint raster_id = SKC_UINT_MAX; + skc_uint extra_blocks = 0; + + if (is_active) + { + // load meta_in + meta.in.u32v4 = vload4(gid,metas); + + // load raster_id as early as possible + raster_id = raster_ids[gid]; + +#if 0 + printf("%3u + %5u, %5u, %5u, %5u\n", + gid, + meta.in.blocks, + meta.in.offset, + meta.in.pk, + meta.in.rk); +#endif + + // how many blocks will the ttpb blocks consume? + extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) / + SKC_DEVICE_SUBBLOCKS_PER_BLOCK); + + // total keys + meta.out.keys += meta.in.pk; + + // how many blocks do we need to store the keys in the head and trailing nodes? + skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) / + (SKC_RASTER_NODE_DWORDS - 1)); + // increment blocks + extra_blocks += hn; + + // how many nodes trail the head? + meta.out.nodes = hn - 1; + + // update blocks + meta.out.blocks += extra_blocks; + +#if 0 + printf("%3u - %5u, %5u, %5u, %5u\n", + gid, + meta.out.blocks, + meta.out.offset, + meta.out.nodes, + meta.out.keys); +#endif + } + + // + // allocate blocks from block pool + // + // first perform a prefix sum on the subgroup to reduce atomic + // operation traffic + // + // note this idiom can be implemented with vectors, subgroups or + // workgroups + // + + skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks); + skc_uint reads = 0; + + // last lane performs the block pool allocation with an atomic increment + if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) { + reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads + } + + // broadcast block pool base to all lanes + reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1); + + // update base for each lane + reads += prefix - extra_blocks; + + // + // store meta header + // + if (is_active) + { + // store headers back to meta extent + vstore4(meta.out.u32v4,gid,metas); + + // store reads + metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads; + + // get block_id of each raster head + skc_block_id_t const block_id = bp_ids[reads & bp_mask]; + + // update map + map[raster_id] = block_id; + +#if 0 + printf("alloc: %u / %u\n",raster_id,block_id); +#endif + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl new file mode 100644 index 0000000000..27411cfe96 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl @@ -0,0 +1,442 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "raster.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS) + +#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS) + +// +// +// + +#if ( SKC_RASTERS_RECLAIM_X == 1 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_RASTERS_RECLAIM_X == 2 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_RASTERS_RECLAIM_X == 4 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_RASTERS_RECLAIM_X == 8 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_RASTERS_RECLAIM_X == 16) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_RASTERS_RECLAIM_X" +#endif + +#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \ + (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) + +#endif + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + 
SKC_BROADCAST_LAST_HELPER(E,I) + +// +// COMPILE-TIME PREDICATES +// + +#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \ + SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I) + +#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \ + SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I) + +// +// RUN-TIME PREDICATES +// + +#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \ + (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS) + +// +// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL +// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK +// COMBOS (NOT NECESSARILY POW2) +// +// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR +// UINT TYPE INSTEAD OF A ULONG. +// + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint + +// +// +// + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ + (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ + ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ + S = sub_group_scan_exclusive_add(C) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \ + (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK) + +// +// +// + +struct skc_reclaim +{ + skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE]; +}; + +__kernel +SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS +void +skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring + __global skc_uint * const bp_elems, // block pool blocks + __global skc_uint volatile * const bp_atomics, // read/write atomics + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const map, // raster host-to-device map + struct skc_reclaim const reclaim) // array of host raster ids +{ +#if (__OPENCL_VERSION__ < 200) + skc_uint const reclaim_stride = get_num_sub_groups(); +#else + skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); + +#if 0 + // + // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT + // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL + // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE + // RECLAMATION JOB ON THE REST OF THE PIPELINE. 
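+  //
+  // (if enabled, each subgroup would stride through the reclaim array:
+  //  reclaim_idx, reclaim_idx + reclaim_stride, ... until reaching
+  //  SKC_RECLAIM_ARRAY_SIZE)
+  //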
+ // + for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) +#endif + { + // get host raster id + skc_raster_h const raster = reclaim.aN[reclaim_idx]; + + // get block id of raster header + skc_block_id_t id = map[raster]; + + // + // load all of the head block ttxk.lo keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + // load raster header counts -- we only need the blocks and + // nodes words the keys are doublewords. + // + // FIXME -- this can be made portable with compile-time macro expansion + // + skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES + skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes); + } +#endif + // + // acquire a span in the block pool ids ring for reclaimed ids + // + skc_uint bp_ids_base = 0; + + if (get_sub_group_local_id() == 0) { + bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); + } + + bp_ids_base = sub_group_broadcast(bp_ids_base,0); + + // + // mask off everything but the block id + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + h##I = h##I & SKC_TTXK_LO_MASK_ID; \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; +#if 0 + printf("rasters next = %u\n",id); +#endif + } + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%08X %u\n",h##I,h##I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + printf("%08X\n",h##I); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + + // + // - we'll skip subgroups that are entirely header + // + // - but we need to mark any header elements that partially fill + // a subgroup as subblocks + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \ + if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \ + h##I = SKC_UINT_MAX; \ + } \ + } \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + { + // + // count reclaimable blocks in each lane + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // 
store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = h##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + } + + // printf("R %7u ! %u\n",bp_ids_idx,h##I); + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, walk the nodes + // + do { + // id of next block is in last lane + id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); + + // + // load all of the node block ttxk.lo keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // mask off everything but the block id + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + n##I = n##I & SKC_TTXK_LO_MASK_ID; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; +#if 0 + printf("rasters next = %u\n",id); +#endif + } + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%08X %u\n",n##I,n##I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + + // + // count reclaimable blocks in each lane + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = n##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // printf("R %7u ! %u\n",bp_ids_idx,n##I); + + // any more nodes? + } while (--count_nodes > 0); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/render.cl b/src/compute/skc/platforms/cl_12/kernels/render.cl new file mode 100644 index 0000000000..9205334940 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/render.cl @@ -0,0 +1,2165 @@ +/* + * Copyright 2016 Google Inc. 
+ * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "styling_types.h" +#include "atomic_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1) + +// +// +// + +#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15 +#endif + +// +// tile state flag bits +// + +typedef enum skc_tile_flags_e { + + // FLUSH + SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001, + SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002, + SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004, + + // OPACITY + SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008, + + // + // Note: testing for opacity and skipping scattering is on its way + // to becoming a much more programmable option because sometimes we + // may be compositing/blending from back-to-front and/or be using + // group blend rules that ignore opacity. + // + // The point is that all of these decisions should be encoded in + // styling commands and, as much as possible, removed from the final + // group/layer styling traversal render loop. + // + +} skc_tile_flags_e; + +// +// COVER -- assumes availability of either fp16 or fp32 +// + +union skc_tile_cover +{ + struct { + SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH]; + } aN; + +#ifdef SKC_RENDER_TILE_COVER_VECTOR + struct { + SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT]; + } vN; +#endif +}; + +// +// COLOR -- assumes availability of either fp16 or fp32 +// + +union skc_tile_color +{ + union { + struct { + SKC_RENDER_TILE_COLOR r; + SKC_RENDER_TILE_COLOR g; + SKC_RENDER_TILE_COLOR b; + SKC_RENDER_TILE_COLOR a; + } rgba[SKC_TILE_WIDTH]; + } aN; + +#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED + union { + SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH]; + } iN; +#endif + +#ifdef SKC_RENDER_TILE_COLOR_VECTOR + union { + SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT]; + } vN; +#endif + + struct { + union { + struct { + SKC_RENDER_TILE_COLOR r; + SKC_RENDER_TILE_COLOR g; + }; + SKC_RENDER_GRADIENT_FLOAT distance; + }; + union { + struct { + SKC_RENDER_TILE_COLOR b; + SKC_RENDER_TILE_COLOR a; + }; + SKC_RENDER_GRADIENT_FLOAT stoplerp; + }; + } grad[SKC_TILE_WIDTH]; +}; + +// +// SHARED MEMORY STATE +// + +#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT) + +#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE) +#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA)) + +// +// +// + +union skc_subgroup_smem +{ + // + // The tiles are stored in column-major / height-major order + // + // The final column is a guard column that is OK to write to but + // will never be read. 
It simplifies the TTSB scatter but could be + // predicated if SMEM is really at a premium. + // +#if ( SKC_RENDER_SUBGROUP_SIZE > 1 ) + struct { + SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] + } atomic; +#endif + + struct { + int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] + } aN; + + struct { // assumption is that height = subgroup + SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE]; + } vN; + + struct { // assumption is that height = subgroup + SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE]; + } wide; + + union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; + + half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; + +#if 0 + // + // SPILL TO GMEM + // +#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0) + struct { + +#if (SKC_REGS_COLOR_S > 0) + union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; +#endif + +#if (SKC_REGS_COVER_S > 0) + union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; +#endif + + } regs; +#endif + // + // + // +#endif +}; + +// +// +// + +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + +#define skc_subgroup_lane() 0 + +#else + +#define skc_subgroup_lane() get_sub_group_local_id() + +#endif + +// +// +// + +typedef skc_uint skc_ttsk_lo_t; +typedef skc_uint skc_ttsk_hi_t; + +typedef skc_uint skc_ttpk_lo_t; +typedef skc_uint skc_ttpk_hi_t; + +typedef skc_uint skc_ttxk_lo_t; +typedef skc_uint skc_ttxk_hi_t; + +typedef skc_uint skc_ttck_lo_t; +typedef skc_uint skc_ttck_hi_t; + +typedef skc_uint2 skc_ttck_t; + +typedef skc_int skc_ttxb_t; + +// +// TTCK (32-BIT COMPARE) v1: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 18 | 7 | 7 | +// +// +// TTCK (32-BIT COMPARE) v2: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 15 | 9 | 8 | +// +// +// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 27 | 1 | 1 | 18 | 9 | 8 | +// + +static +skc_uint +skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) +{ + return a & SKC_TTCK_LO_MASK_ID; +} + +static +skc_layer_id +skc_ttck_get_layer(skc_ttck_t const a) +{ + // + // FIXME -- a union with a ulong and a shift down and mask is + // probably faster on some architectures + // + skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); + skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; + + return lo | hi; +} + +static +skc_uint +skc_ttck_hi_get_x(skc_ttck_hi_t const a) +{ + return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); +} + +static +skc_uint +skc_ttck_hi_get_y(skc_ttck_hi_t const a) +{ + return a >> SKC_TTCK_HI_OFFSET_Y; +} + +static +skc_bool +skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b) +{ + skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); + skc_uint const hi = (a.hi ^ b.hi); + + return (lo | hi) == 0; +} + +static +skc_bool +skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) +{ + return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; +} + +static +skc_bool +skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) +{ + return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; +} + +// +// TILE TRACE SUBPIXEL +// +// The subpixels are 
encoded with either absolute tile coordinates +// (32-bits) or packed in delta-encoded form form. +// +// For 32-bit subpixel packing of a 32x32 tile: +// +// A tile X is encoded as: +// +// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate. +// +// SX : 6 : unsigned subpixel span from min to max x with range +// [0,32]. The original direction is not captured. Would +// be nice to capture dx but not necessary right now but +// could be in the future. <--- SPARE VALUES AVAILABLE +// +// A tile Y is encoded as: +// +// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate. +// +// DY : 6 : signed subpixel delta y1-y0. The range of delta is +// [-32,32] but horizontal lines are not encoded so [1,32] +// is mapped to [0,31]. The resulting range [-32,31] fits +// in 6 bits. +// +// TTS: +// +// 0 31 +// | TX | SX | TY | DY | +// +-----+------+-----+------+ +// | 10 | 6 | 10 | 6 | +// + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a) +{ + // + // extract the whole pixel y coordinate + // + return SKC_BFE(a, + SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2, + SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2); +} + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a) +{ + // + // get the linear array tile index of the pixel + // + return (((a & SKC_TTS_MASK_TX_PIXEL) + +#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2) + >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2) +#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2) + << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2) +#endif + + ) | skc_tts_get_ty_pixel_v(a)); +} + +#if 0 +static +skc_ttx_v_s32_t +skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) +{ + skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY; + + return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31)); +} +#else +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) +{ + SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY; + + return dy - (~a >> 31); +} +#endif + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a) +{ + return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2); +} + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_sx_v(SKC_RENDER_TTS_V const a) +{ + return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX); +} + +// +// +// + +static +void +skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem) +{ + // + // SIMD / CPU + // + // & + // + // SIMT / GPU + // + // Note that atomic_init() is likely implemented as a simple + // assignment so there is no identifiable performance difference on + // current targets. + // + // If such an architecture appears in the future then we'll probably + // still want to implement this zero'ing operation as below but + // follow with an appropriate fence that occurs before any scatter + // operations. + // + // The baroque expansion below improves performance on Intel GEN by, + // presumably, achieving the 64-byte per clock SLM write as well as + // minimizing the overall number of SEND() block initializations and + // launches. + // + // Intel GENx has a documented 64 byte per cycle SLM write limit. + // So having each lane in an 8 lane subgroup zero-write 8 bytes is + // probably a safe bet (Later: benchmarking backs this up!). + // + // Note there is no reason at this time to unroll this loop. + // + for (uint ii=0; iiwide.area[ii][skc_subgroup_lane()] = ( 0 ); +} + +// +// Note this is going to be vectorizable on most architectures. 
+// +// The return of the key translation feature might complicate things. +// + +static +void +skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, + __local union skc_subgroup_smem * SKC_RESTRICT const smem, + skc_block_id_t const pb_id) +{ + skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); + +#if ( SKC_TILE_RATIO == 1 ) + + SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset]; + +#elif ( SKC_TILE_RATIO == 2 ) + + SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent); + +#else + +#error("tile ratio greater than 2 not supported") + +#endif + + // + // Note there is no need to use an atomic for this operation on the + // current group of target platforms... but this may change if + // atomic ops truly go through a different path. + // + // As noted above, this direct increment is probably faster and can + // always be followed by a fence. + // + // Furthermore, note that the key sorting orders all ttck keys + // before ttpk keys. + // + + // + // FIXME -- if the SMEM store is wider than bank word count then we + // might want to odd-even interleave the TTP values if the target + // device can't handle 64-bit stores + // + + // + // skipping per-key translation for now + // + smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1); +} + +// +// Note that skc_scatter_ttsb is *not* vectorizable unless the +// architecture supports a "scatter-add" capability. All relevant +// GPUs support atomic add on shared/local memory and thus support +// scatter-add. +// + +static +void +skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, + __local union skc_subgroup_smem * SKC_RESTRICT const smem, + skc_block_id_t const sb_id) +{ + skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + + SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset]; + + // + // Skipping per-key translation for now + // + + // Index into tile + // + // The tiles are stored in column-major / height-major order + // + // The final column is a guard column that is OK to write to but + // will never be read. It simplifies the TTSB scatter but could be + // predicated if SMEM is really at a premium. + // + + SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v); + +#if 0 + if (tts_v != SKC_TTS_INVALID) + printf("(%08X) = %u\n",tts_v,xy_idx); +#endif + + // + // adjust subpixel range to max y + // + // range is stored as [-32,31] and when read [0,31] is mapped to + // [1,32] because a dy of 0 is not possible. + // + // more succinctly: if dy >= 0 then ++dy + // + SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v); + + // + // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid? + // + + // this "min(x0) * 2 + dx" is equivalent to "x0 + x1" + SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v); + + // Calculate left and right coverage contribution trapezoids + SKC_RENDER_TTS_V_BITFIELD const left = dy * widths; + SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left; + + // + // Accumulate altitudes and areas + // + // Optimization: if the device supports an CPU/SIMD vector-add or + // GPU/SIMT scatter-add atomic int2 add operation then placing the + // ALT and AREA values side-by-side would halve the number of + // additions. 
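+  //
+  // worked example (hypothetical 32x32 subpixel tiles): a TTS with
+  // tx = 3, sx = 5, dy = 4 contributes
+  //
+  //   left  = dy * (2*tx + sx)    = 4 * 11   = 44
+  //   right = dy * 2 * 32 - left  = 256 - 44 = 212
+  //
+  // to the two accumulators for its pixel column
+  //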
+ // +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + // + // CPU/SIMD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \ + smem->aN.area[ xy_idx C] += right C; \ + } + +#else + // + // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \ + SKC_TILE_HEIGHT + xy_idx C, \ + left C); \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \ + right C); \ + } +#endif + + SKC_RENDER_TTSB_EXPAND(); +} + +// +// Note that 2048.0 can be represented exactly with fp16... fortuitous! +// + +#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y) +#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA) +#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1) +#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA) + +// +// +// + +static +void +skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + union skc_tile_cover * SKC_RESTRICT const cover, + union skc_tile_color * SKC_RESTRICT const color) +{ + SKC_RENDER_ACC_COVER_INT area = 0; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 + for (uint ii=0; iivN.area[ii][skc_subgroup_lane()]; + SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); + SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA)); + + cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32); + } +} + +static +void +skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + union skc_tile_cover * SKC_RESTRICT const cover, + union skc_tile_color * SKC_RESTRICT const color) +{ + SKC_RENDER_ACC_COVER_INT area = 0; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 + for (uint ii=0; iivN.area[ii][skc_subgroup_lane()]; + SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); + SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA)); + + cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32; + } +} + +// +// +// + +static +void +skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color) +{ + // + // rgba = solid fill + // + __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; + + *cmd_next += 2; + +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) + + SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; iiaN.rgba[ii].r = rg.lo; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; iiaN.rgba[ii].g = rg.hi; + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; iiaN.rgba[ii].b = ba.lo; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; iiaN.rgba[ii].a = ba.hi; + +#else + + SKC_RENDER_TILE_COLOR_PAIR const rg = 
SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + SKC_RENDER_TILE_COLOR const r = rg.lo; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r); + + SKC_RENDER_TILE_COLOR const g = rg.hi; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g); + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + SKC_RENDER_TILE_COLOR const b = ba.lo; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b); + + SKC_RENDER_TILE_COLOR const a = ba.hi; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a); + +#endif +} + +// +// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" +// +// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ +// +// Lerp in two fma/mad ops: +// +// t * b + ((-t) * a + a) +// +// Note: OpenCL documents mix() as being implemented as: +// +// a + (b - a) * t +// +// But this may be a native instruction on some devices. For example, +// on GEN9 there is an LRP "linear interoplation" function but it +// doesn't appear to support half floats. +// + +#if 1 +#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) +#else +#define SKC_LERP(a,b,t) mix(a,b,t) +#endif + +// +// CPUs have a mock local address space so copying the gradient header +// is probably not useful. Just read directly from global. +// + +#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL +#define SKC_RENDER_GRADIENT_SPACE __local +#else +#define SKC_RENDER_GRADIENT_SPACE __global +#endif + +// +// gradient is non-vertical +// +// removed the vertical (actually, horizontal) special case +// + +static +void +skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + __global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // Where is this tile? + // + // Note that the gradient is being sampled from pixel centers. + // + SKC_RENDER_GRADIENT_FLOAT const y = +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P + (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) + + (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE)); + + float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH); + + // + // Get starting numerator and denominator + // + // Note: if gh[0].dx is exactly 0.0f then this is a vertical + // gradient and can be handled by a special opcode. + // + // Note: the mad() ordering is slightly different than the original + // CUDA implementation. + // + union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) }; + + *cmd_next += 4; + + float const gv_x_dot = mad(x,gv.dx,gv.p0); + SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot); + + // + // Where are columns along gradient vector? + // + // TODO: Note that the gv_denom isn't multiplied through. + // + // Please doublecheck this... but I recall that in certain cases + // this wipes out some precision and results in minor but noticeable + // gradient artifacts. 
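+ //
+ // For reference, the per-column evaluation below works out to:
+ //
+ //   distance(ii) = ((x + ii) * gv.dx + y * gv.dy + gv.p0) * gv.denom
+ //
+ // which is (presumably) the pixel center projected onto the
+ // gradient vector and then scaled by gv.denom.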
+ // + // All arguments are scalars except gv_numer so a simpler + // evaluation might save some flops. + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom; + + // + // is gradient non-repeating, repeating or reflecting? + // + switch (commands[(*cmd_next)++].u32) + { + case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING: + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f); + break; + + case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING: + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].distance -= floor(color->grad[ii].distance); + break; + + default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING + // + // OPTIMIZATION: Can this be done in fewer than ~4 ops? + // + // Note: OpenCL "rint()" is round-to-nearest-even integer! + // + // Note: the floor() "round to -inf" op is implemented in the + // GEN op 'FRC' so probably don't use trunc() when floor will + // suffice. + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].distance); + color->grad[ii].distance = fabs(dist_abs - rint(dist_abs)); + } + } + + // + // initialize "stoplerp" for all columns + // + uint const slope_count = commands[(*cmd_next)++].u32; + uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME + + { + float const slope = commands[(*cmd_next)++].f32; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].stoplerp = color->grad[ii].distance * slope; + } + + // + // compute stoplerp for remaining stops + // + for (int jj=1; jjgrad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp); + } + + // + // copy gradient colors to local memory + // + uint const gd_n = slope_count + 1; + +#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL + // + // copy entire gradient descriptor to local memory + // + for (uint ii=skc_subgroup_lane(); iicmds[ii].u32 = commands[*cmd_next + ii].u32; + + __local half const * const SKC_RESTRICT gc = smem->gc + 0; +#else + // + // prefetch entire gradient header + // + // no noticeable impact on performance + // + // prefetch(&commands[*cmd_next].u32,gh_words); + // + __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0; +#endif + + // + // adjust cmd_next so that V1 structure is consumed -- FIXME + // + *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n); + + // + // lerp between color pair stops + // + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iigrad[ii].stoplerp); + SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp)); + + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + 
+#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac); + } + } +} + +// +// +// + +static +void +skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // fralunco = cover.wip * acc.a + // + // acc.r = fralunco * wip.r + acc.r + // acc.g = fralunco * wip.g + acc.g + // acc.b = fralunco * wip.b + acc.b + // acc.a = -fralunco * wip.a + acc.a + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] * color_acc->aN.rgba[ii].a; + + color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // cover_min = min(cover.wip,a.acc) + // + // r.acc = cover_min * r.wip + r.acc + // g.acc = cover_min * g.wip + g.acc + // b.acc = cover_min * b.wip + b.acc + // a.acc = -cover_min * a.wip + a.acc + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii],color_acc->aN.rgba[ii].a); + + color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // r.acc = (cover.wip * r.wip) * r.acc + // g.acc = (cover.wip * g.wip) * g.acc + // b.acc = (cover.wip * b.wip) * b.acc + // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha) + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r; + color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g; + color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b; + color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a; + } +} + +// +// +// + +static +void +skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc, + union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover 
const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // cover.wip.contrib = (1.0 - cover.acc) * cover.wip + // cover.acc = cover.acc + cover.wip.contrib + // + // r.acc = cover.wip.contrib * r.wip + r.acc + // g.acc = cover.wip.contrib * g.wip + g.acc + // b.acc = cover.wip.contrib * b.wip + b.acc + // a.acc = -cover.wip.contrib * a.wip * a.acc + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii]) * cover_wip->aN.c[ii]; + + cover_acc->aN.c[ii] += contrib; + + color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk, + union skc_tile_cover const * SKC_RESTRICT const cover_wip) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = cover_wip->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = cover_wip->vN.c[ii]; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk, + union skc_tile_cover const * SKC_RESTRICT const cover_acc) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = cover_acc->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNTN))) + for (uint ii=0; iivN.c[ii] = cover_acc->vN.c[ii]; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip) +{ + // + // cover.wip.contrib = (1.0 - cover.acc) * cover.wip + // cover.acc = cover.acc + cover.wip.contrib + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]); +} + +// +// +// + +static +void +skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip, + union skc_tile_cover const * SKC_RESTRICT const cover_msk) +{ + // + // cover.wip *= cover.msk + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] *= cover_msk->aN.c[ii]; +} + +// +// +// + +static +void +skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = 0; + +#endif +} + +static +void +skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // 
__attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = 0; + +#endif +} + +static +void +skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = 0; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 1; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.c[ii] = 1 - cover->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; iivN.c[ii] = 1 - cover->vN.c[ii]; + +#endif +} + +// +// +// + +static +void +skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color) +{ +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r = 0; + color->aN.rgba[ii].g = 0; + color->aN.rgba[ii].b = 0; + color->aN.rgba[ii].a = 1; + } + +#else + // + // DISABLED ON GEN9 -- probably a compiler bug + // + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.odd = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.odd = 1; +#endif +} + +static +void +skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color) +{ +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r = 0; + color->aN.rgba[ii].g = 0; + color->aN.rgba[ii].b = 0; + color->aN.rgba[ii].a = 1; + } + +#else + // + // DISABLED ON GEN9 -- probably a compiler bug + // + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].even.odd = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; iivN.rgba[ii].odd.odd = 1; +#endif +} + +// +// +// + +static +bool +skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color) +{ + // + // returns true if tile is opaque + // + // various 
hacks to test for complete tile opacity + // + // note that front-to-back currently has alpha at 0.0f -- this can + // be harmonized to use a traditional alpha if we want to support + // rendering in either direction + // + // hack -- ADD/MAX/OR all alphas together and test for non-zero + // + SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=1; iiaN.rgba[ii].a; + +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + return !any(t != ( 0 )); + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) + // + // SIMT - scalar per lane + // + return !sub_group_any(t != 0); + +#else + // + // SIMT - vector per lane + // + return !sub_group_any(any(t != ( 0 ))); + +#endif + + // + // TODO: The alternative vector-per-lane implementation below is + // *not* believed to be performant because the terse vector-wide + // test is just hiding a series of comparisons and is likely worse + // than the blind ADD/MAX/OR'ing of all alphas followed by a single + // test. + // +#if 0 + // + // SIMT - vector per lane + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1))) + for (uint ii=0; iivN.ba[ii].a != ( 0 )))) + return false; + } + + return true; +#endif +} + +// +// +// + +static +void +skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color) +{ + // + // acc.r = acc.a * r + acc.r + // acc.g = acc.a * g + acc.g + // acc.b = acc.a * b + acc.b + // + __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; + + *cmd_next += 2; + + SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g); + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b); +} + +// +// +// + +// #define SKC_SURFACE_IS_BUFFER +#ifdef SKC_SURFACE_IS_BUFFER + +static +void +skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface, + skc_uint const surface_pitch, + union skc_tile_color const * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // NEW MAJOR OPTIMIZATION: + // + // Rotating and rasterizing the original world transform by -90 + // degrees and then rendering the scene scene by +90 degrees enables + // all the final surface composite to be perfomed in perfectly + // coalesced wide transactions. + // + // For this reason, linear access to the framebuffer is preferred. + // + // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv + // + // NOTE THIS IS TRANSPOSED BY 90 DEGREES + // + // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE + // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. 
+ // + // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS + // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS + // + // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL + // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER + // + uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE; + uint const x = skc_ttck_hi_get_x(ttck_hi); + uint const y = skc_ttck_hi_get_y(ttck_hi) ; + uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane(); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiaN.rgba[ii].r * 255); + rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8; + rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16; + + surface[base + ii * pitch] = rgba; + + // printf("%08v2X\n",rgba); + } +} + +#else + +static +void +skc_surface_composite_u8_rgba(__write_only image2d_t surface, + union skc_tile_color const * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // NEW MAJOR OPTIMIZATION: + // + // Rotating and rasterizing the original world transform by -90 + // degrees and then rendering the scene scene by +90 degrees enables + // all the final surface composite to be perfomed in perfectly + // coalesced wide transactions. + // + // For this reason, linear access to the framebuffer is preferred. + // + // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv + // + // NOTE THIS IS TRANSPOSED BY 90 DEGREES + // + // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE + // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. + // + // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS + // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS + // + // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL + // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER + // + +#if 1 + int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; + int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiiN.rgba[ii] A); \ + } + +#else + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_SURFACE_COLOR const rgba = \ + (SKC_RENDER_SURFACE_COLOR) \ + (color->aN.rgba[ii].r C, \ + color->aN.rgba[ii].g C, \ + color->aN.rgba[ii].b C, \ + 1.0); \ + SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \ + } + +#endif + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + x += 1; + } +#else + int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); + int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; iiiN.rgba[ii] A); \ + } + +#else + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_SURFACE_COLOR const rgba = \ + (SKC_RENDER_SURFACE_COLOR) \ + (color->aN.rgba[ii].r C, \ + color->aN.rgba[ii].g C, \ + color->aN.rgba[ii].b C, \ + 1.0); \ + SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \ + } + +#endif + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + } + +#endif +} + +#endif + +// +// +// +static +uint const +skc_ttck_lane(uint const ttck_idx) +{ + return ttck_idx & SKC_RENDER_SUBGROUP_MASK; +} + +// +// RENDER KERNEL +// + +__kernel +SKC_RENDER_KERNEL_ATTRIBS +void +skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers, + __global struct 
skc_group_node const * SKC_RESTRICT const groups, + __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename + + __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys + skc_uint const ttck_count, // rename: key_count + + __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets + skc_uint const tile_count, // rename: offset_count + + __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, +#ifdef SKC_SURFACE_IS_BUFFER + __global void * SKC_RESTRICT const surface, +#else + __write_only image2d_t surface, +#endif +#ifdef SKC_SURFACE_IS_BUFFER + skc_uint const surface_pitch, +#endif + uint4 const tile_clip) // rename: clip +{ + // + // Each subgroup is responsible for a tile. No extra subgroups are + // launched. + // + // FIXME -- might be better implemented as a "grid stride loop" if + // Intel GEN really has a local memory "quantum" of 4KB which means + // we would need to launch 4 subgroups per workgroup. + // + // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB. + // + + // + // declare tile cover and color registers + // + // this used to be a neat unified struct but the Intel GEN compiler + // wasn't cooperating and spilling to private memory even though all + // registers were indexed by constants + // + union skc_tile_color color_wip; + union skc_tile_color color_acc; + + union skc_tile_cover cover_wip; + union skc_tile_cover cover_acc; + union skc_tile_cover cover_msk; + + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0) + // as a uniform but the alternative calculation used when there are + // multiple subgroups per workgroup is not cooperating and + // driving spillage elsewhere. + // +#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const ttck_offset_idx = get_group_id(0); +#else + skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // load the starting ttck for this offset and get a bound on the max + // number of keys that might be loaded + // + // these are uniform across all subgroup lanes + // + skc_uint ttck_idx = ttck_offsets[ttck_offset_idx]; + + // + // FIXME -- SIMD/CPU version should probaby load a 256-bit (4-wide) + // vector of ttck keys + // +#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + + skc_ttck_t ttck = ttck_keys[ttck_idx]; + +#else + + uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK; + uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK; + skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)] + +#endif + + // + // set up style group/layer state + // + struct skc_styling_group { + union skc_group_range range; + skc_uint depth; + skc_uint id; + } group; + + group.range.lo = 0; + group.range.hi = SKC_UINT_MAX; + group.depth = 0; + group.id = SKC_UINT_MAX; + + // + // start with clear tile opacity, knockout and flag bits + // + // uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 + // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 + // + skc_uint flags = 0; + + // + // declare and initialize accumulators + // +#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) + __local union skc_subgroup_smem smem[1]; +#else + __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS]; + __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id(); +#endif + +#ifdef 
SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + // + // select the initial ttck key + // + skc_ttck_t ttck; +#if 0 + ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN +#else + ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND + ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane); +#endif + +#endif + + // + // save the first key so we know what tile we're in + // + skc_ttck_t ttck0 = ttck; + + // + // evaluate the coarse clip as late as possible + // + skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi); + + if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x)) + return; + + skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi); + + if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y)) + return; + +#if 0 + printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y); +#endif + + // + // load -> scatter -> flush + // + while (true) + { + // if scattering is disabled then just run through ttck keys + bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0; + + // need to clear accumulators before a scatter loop + if (is_scatter_enabled) + { + skc_tile_aa_zero(smem); + } + + do { + // skip scattering? + if (is_scatter_enabled) + { + skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo); + + if (skc_ttck_lo_is_prefix(ttck.lo)) { + skc_scatter_ttpb(ttxb_extent,smem,xb_id); + } else { + skc_scatter_ttsb(ttxb_extent,smem,xb_id); + } + } + + // + // any ttck keys left? + // + if (++ttck_idx >= ttck_count) + { + flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; + break; + } + + // + // process next ttck key + // +#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + // + // SIMD -- read next key + // + ttck = ttck_keys[ttck_idx]; +#else + // + // SIMT -- refresh the ttck_s? + // + uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK; + + if (ttck_lane_next == 0) + ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)]; + + // + // broadcast next key to entire subgroup + // +#if 0 + ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN +#else + ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND + ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next); +#endif +#endif + // continue scattering if on same YXL layer + } while (skc_ttck_equal_yxl(ttck0,ttck)); + + // finalize if no longer on same YX tile + if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi)) + { + // otherwise, unwind the tile styling and exit + flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; + } + + // + // given: new layer id from ttxk key + // + // load [layer id]{ group id, depth } + // + // if within current group's layer range + // + // if at same depth + // + // load and execute cover>[mask>]color>blend commands + // + // else if not at same depth then move deeper + // + // for all groups in group trail from cur depth to new depth + // enter group, saving and initializing regs as necessary + // increment depth and update layer range + // load and execute cover>[mask>]color>blend commands + // + // else not within layer range + // + // exit current group, restoring regs as necessary + // decrement depth and update layer range + // + // + skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi + union skc_layer_node const layer_node_new = layers[layer_id_new]; + + // clear flag that controls group/layer traversal + flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE; + + do { + bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0; + + // + // is 
layer a child of the current parent group? + // + uint cmd_next = 0; + + if (!unwind && (layer_node_new.parent == group.id)) + { + // execute this layer's cmds + cmd_next = layer_node_new.cmds; + + // if this is final then configure so groups get unwound, otherwise we're done + flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE); + } + else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi)) + { + // + // is layer in a child group? + // + union skc_group_parents const gp = groups[layer_node_new.parent].parents; + uint const gn = gp.depth - ++group.depth; + + if (gn == 0) + group.id = layer_node_new.parent; + else + group.id = commands[gp.base + gn - 1].parent; + + // update group layer range + group.range = groups[group.id].range; + + // enter current group + cmd_next = groups[group.id].cmds.enter; + } + else // otherwise, exit this group + { + // enter current group + cmd_next = groups[group.id].cmds.leave; + + // decrement group depth + if (--group.depth == 0) + { + flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE; + } + else + { + // get path_base of current group + uint const gnpb = groups[group.id].parents.base; + + // get parent of current group + group.id = commands[gnpb].parent; + + // update group layer range + group.range = groups[group.id].range; + } + } + + // + // execute cmds + // + while (true) + { + union skc_styling_cmd const cmd = commands[cmd_next++]; + + switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE) + { + case SKC_STYLING_OPCODE_NOOP: + break; + + case SKC_STYLING_OPCODE_COVER_NONZERO: + skc_tile_cover_nonzero(smem,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_EVENODD: + skc_tile_cover_evenodd(smem,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACCUMULATE: + skc_tile_cover_accumulate(&cover_acc,&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_MASK: + skc_tile_cover_wip_mask(&cover_wip,&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_WIP_ZERO: + skc_tile_cover_wip_zero(&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACC_ZERO: + skc_tile_cover_acc_zero(&cover_acc); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_ZERO: + skc_tile_cover_msk_zero(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_ONE: + skc_tile_cover_msk_one(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_INVERT: + skc_tile_cover_msk_invert(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COLOR_FILL_SOLID: + skc_tile_color_fill_solid(commands,&cmd_next,&color_wip); + break; + + case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR: + // + // FIXME -- gradients shouldn't be executing so much + // conditional driven code at runtime since we *know* + // the gradient style on the host can just create a + // new styling command to exploit this. + // + // FIXME -- it might be time to try using the GPU's + // sampler on a linear array of half4 vectors -- it + // might outperform the explicit load/lerp routines. + // + // FIXME -- optimizing for vertical gradients (uhhh, + // they're actually horizontal due to the -90 degree + // view transform) is nice but is it worthwhile to + // have this in the kernel? Easy to add it back... 
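+ // One concrete (and unverified) form of the sampler idea above:
+ // bind the stop colors as a 1D RGBA image and sample it with a
+ // CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE |
+ // CLK_FILTER_LINEAR sampler so that read_imagef() performs the
+ // lerp between adjacent stops.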
+ // +#if defined( SKC_ARCH_GEN9 ) + // disable gradients due to exessive spillage -- fix later + cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32); +#else + skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi); +#endif + break; + + case SKC_STYLING_OPCODE_COLOR_WIP_ZERO: + skc_tile_color_wip_zero(&color_wip); + break; + + case SKC_STYLING_OPCODE_COLOR_ACC_ZERO: + skc_tile_color_acc_zero(&color_acc); + break; + + case SKC_STYLING_OPCODE_BLEND_OVER: + skc_tile_blend_over(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_PLUS: + skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_MULTIPLY: + skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_KNOCKOUT: + skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK: + // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK: + // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc); + break; + + case SKC_STYLING_OPCODE_BACKGROUND_OVER: + skc_tile_background_over(commands,&cmd_next,&color_acc); + break; + + case SKC_STYLING_OPCODE_SURFACE_COMPOSITE: +#ifdef SKC_SURFACE_IS_BUFFER + skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi); +#else + skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi); +#endif + break; + + case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY: + if (skc_tile_color_test_opacity(&color_acc)) + flags |= SKC_TILE_FLAGS_SCATTER_SKIP; + break; + + default: + return; // this is an illegal opcode -- trap and die! + } + + // + // if sign bit is set then this was final command + // + if (cmd.s32 < 0) + break; + } + + // continue as long as tile flush isn't complete + } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0); + + // return if was the final flush + if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) + return; + + // update wip ttck_hi + ttck0 = ttck; + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl new file mode 100644 index 0000000000..378d51d8d7 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl @@ -0,0 +1,130 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE +// LAYOUT OF THE TTCK KEY. 
IF THE TTCK KEY IS ALTERED THEN THIS +// KERNEL WILL NEED TO BE UPDATED +// + +#include "tile.h" +#include "atomic_cl.h" +#include "device_cl_12.h" + +// +// +// + +#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) +#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) + +// +// +// + +#define SKC_YX_NEQ(row,prev) \ + (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0) + +// +// +// + +__kernel +__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) +void +skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, + __global uint * SKC_RESTRICT const indices, + __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) +{ + uint const global_id = get_global_id(0); + uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); + uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + + // + // LOAD ALL THE ROWS + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + + HS_SLAB_ROWS(); + + // + // LOAD LAST REGISTER FROM COLUMN TO LEFT + // + uint diffs = 0; + uint2 r0 = r1; + + if (gmem_base > 0) { + // if this is the first key in any slab but the first then it + // broadcast loads the last key in previous slab + r0.hi = as_uint2(vout[gmem_base - 1]).hi; + } else if (get_sub_group_local_id() == 0) { + // if this is the first lane in the first slab + diffs = 1; + } + + // now shuffle in the last key from the column to the left + r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); + + // + // FIND ALL DIFFERENCES IN SLAB + // + uint valid = 0; + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + valid |= ((r##row != SKC_ULONG_MAX) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + diffs |= (SKC_YX_NEQ(row,prev) << prev); + + HS_SLAB_ROWS(); + + // + // SUM UP THE DIFFERENCES + // + uint const valid_diffs = valid & diffs; + uint const count = popcount(valid_diffs); + uint const inclusive = sub_group_scan_inclusive_add(count); + uint const exclusive = inclusive - count; + + // + // RESERVE SPACE IN THE INDICES ARRAY + // + uint next = 0; + + if (get_sub_group_local_id() == HS_LANES_PER_WARP-1) + next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset + + // distribute base across subgroup + next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1); + + // + // STORE THE INDICES + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (valid_diffs & (1 << prev)) \ + indices[next++] = lane_idx + prev; + + HS_SLAB_ROWS(); + + // + // TRANSPOSE THE SLAB AND STORE IT + // + HS_TRANSPOSE_SLAB(); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl new file mode 100644 index 0000000000..e9accde307 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl @@ -0,0 +1,394 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE +// LAYOUT OF THE TTRK KEY. 
IF THE TTRK KEY IS ALTERED THEN THIS +// KERNEL WILL NEED TO BE UPDATED +// + +#include "tile.h" +#include "raster_builder_cl_12.h" // need meta_in structure +#include "device_cl_12.h" + +// +// +// + +#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) +#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) + +// +// THE BEST TYPE TO ZERO SMEM +// + +#define SKC_ZERO_TYPE ulong +#define SKC_ZERO_WORDS 2 + +// +// THE ORDER OF COMPONENTS IS: +// +// 0: blocks +// 1: offset +// 2: pk +// 3: rk +// + +#if (HS_KEYS_PER_SLAB < 256) + +#define SKC_META_TYPE uint +#define SKC_META_WORDS 1 + +#define SKC_COMPONENT_TYPE uchar + +#else + +#define SKC_META_TYPE uint2 +#define SKC_META_WORDS 2 + +#define SKC_COMPONENT_TYPE ushort + +#endif + +// +// +// + +#if ( SKC_TTRK_HI_BITS_COHORT <= 8) +#define SKC_COHORT_TYPE uchar +#else +#define SKC_COHORT_TYPE ushort +#endif + +// +// +// + +#define SKC_COHORT_ID(row) \ + as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT + +// +// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED +// + +#define SKC_IS_BLOCK(row) \ + ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + +#define SKC_YX(row,prev) \ + (as_uint2(r##row).hi ^ as_uint2(r##prev).hi) + +#define SKC_IS_PK(row,prev) \ + ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X) + +// +// COHORT SIZE IS ALWAYS A POWER-OF-TWO +// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO +// +// COHORT SIZE >= SUBGROUP SIZE +// + +#define SKC_COHORT_SIZE (1<> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); + uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + + // + // LOAD ALL THE ROWS + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + + HS_SLAB_ROWS(); + + // + // LOAD LAST REGISTER FROM COLUMN TO LEFT + // + uint diffs = 0; + uint2 r0 = 0; + + if (gmem_base > 0) { + // if this is the first key in any slab but the first then it + // broadcast loads the last key in previous slab + r0.hi = as_uint2(vout[gmem_base - 1]).hi; + } else { + // otherwise broadcast the first key in the first slab + r0.hi = sub_group_broadcast(as_uint2(r1).hi,0); + // and mark it as an implicit diff + if (get_sub_group_local_id() == 0) + diffs = 1; + } + + // now shuffle in the last key from the column to the left + r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); + + // shift away y/x + SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT; + + // + // EXTRACT ALL COHORT IDS EARLY... + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row); + + HS_SLAB_ROWS(); + + // + // DEBUG + // +#if 0 + if (gmem_base == HS_KEYS_PER_SLAB * 7) + { + if (get_sub_group_local_id() == 0) + printf("\n%llX ",as_ulong(r0)); + else + printf("%llX ",as_ulong(r0)); +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (get_sub_group_local_id() == 0) \ + printf("\n%llX ",r##row); \ + else \ + printf("%llX ",r##row); + + HS_SLAB_ROWS(); + } +#endif + + // + // CAPTURE ALL CONDITIONS WE CARE ABOUT + // + // Diffs must be captured before cohorts + // + uint valid = 0; + uint blocks = 0; + uint pks = 0; + SKC_COHORT_TYPE c_max = 0; + + // + // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. 
AN + // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE + // +#if 0 + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + diffs |= ((c##row != c##prev) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + blocks |= (SKC_IS_BLOCK(row) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + pks |= SKC_IS_PK(row,prev) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + valid |= ((r##row != SKC_ULONG_MAX) << prev); + + HS_SLAB_ROWS(); + +#else + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (c##row != c##prev) \ + diffs |= 1<>HS_LANES_PER_WARP_LOG2,c_min,c_max); +#endif + + // + // ZERO SMEM + // + // zero only the meta info for the cohort ids found in this slab + // +#if (SKC_ZERO_WORDS >= SKC_META_WORDS) + uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id(); + uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO; + + for (; zz<=zz_max; zz+=HS_LANES_PER_WARP) + shared.z[zz] = 0; +#else + // ERROR -- it's highly unlikely that the zero type is smaller than + // the meta type +#error("Unsupported right now...") +#endif + + // + // ACCUMULATE AND STORE META INFO + // + uint const valid_blocks = valid & blocks; + uint const valid_pks = valid & pks & ~diffs; + SKC_META_TYPE meta = ( 0 ); + +#define SKC_META_LOCAL_ADD(meta) \ + atomic_add(shared.m+HS_REG_LAST(c),meta); + +#define SKC_META_LOCAL_STORE(meta,prev) \ + shared.m[c##prev] = meta; + + // note this is purposefully off by +1 +#define SKC_META_RESET(meta,curr) \ + meta = ((gmem_off + curr) << 8); + +#if 0 + + // FIXME -- this can be tweaked to shift directly +#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ + meta += ((((blocks >> prev) & 1) ) | \ + (((pks >> prev) & 1) << 16) | \ + (((rks >> prev) & 1) << 24)); + +#else + +#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ + if (blocks & (1<= cc_min) && (cc <= cc_max)) + { + uint const c = shared.c[cc]; + + if (c != 0) + atomic_add(metas+cc,c+adjust); + } + + cc += HS_LANES_PER_WARP; + + for (; cc<=cc_max; cc+=HS_LANES_PER_WARP) + { + uint const c = shared.c[cc]; + + if (c != 0) + atomic_add(metas+cc,c+adjust); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/path_builder_cl_12.c b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c new file mode 100644 index 0000000000..e915dffada --- /dev/null +++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c @@ -0,0 +1,1443 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include +#include +#include +#include + +#include "common/cl/assert_cl.h" + +#include "context.h" +#include "handle.h" +#include "grid.h" +#include "path.h" +#include "path_builder.h" + +#include "config_cl.h" +#include "export_cl_12.h" +#include "runtime_cl_12.h" +#include "path_builder_cl_12.h" + +// +// OpenCL 1.2 devices support mapping of buffers into the host address +// space. +// +// Mapped buffers must be aligned on MIN_DATA_TYPE_ALIGN_SIZE bit +// boundary (e.g. 128 bytes). This complicates coordinating sharing +// of data between the host and the device. +// +// Some OpenCL 2.0 devices support fine-grained shared virtual memory +// pointers with byte-addressing and allow simpler coordination +// strategies at the cost of maintaining cache coherency. 
+// +// The path builder is focused on moving bulk path data from the host +// into the device-managed "block" memory pool and arranging it into a +// SIMT/SIMD-friendly data structure that can be efficiently read by +// the rasterizer. +// +// Note that one simplifying assumption is that the maximum length of +// a *single* path can't be larger than what fits in the single extent +// (which is split into M subbuffers). This would be a very long path +// and a legitimate size limitation. +// +// For some systems, it may be appropriate to never pull path data +// into the device-managed block pool and instead present the path +// data to the device in a temporarily available allocated memory +// "zone" of paths that can be discarded all at once. +// +// For other systems, it may be appropriate to simply copy the path +// data from host to device. +// +// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be +// targeting support basic map/unmap functionality similar to OpenCL +// 1.2. Furthermore, not all OpenCL 2.0 devices support fine-grained +// sharing of memory and still require a map/unmap step... but note +// that they all support byte-aligned mapping and subbuffers. +// +// The general strategy that this particular CL_12 implementation uses +// is to allocate a large mappable bulk-data path buffer and an +// auxiliary mappable command buffer. +// +// The buffers are split into a reasonable number of properly aligned +// subbuffers to enable simultaneous host and device access. +// + +// +// Blocks: +// 1 extent +// M mapped subbuffers (configurable) to allow for concurrency +// +// Commands: +// 1 extent +// M mapped subbuffers (configurable) to allow for concurrency +// +// Spans: +// M hi/lo structures +// +// { cl_sub, void*, event, base } +// +// - size of sub buffer +// - remaining +// +// - counts +// + +// +// For any kernel launch, at most one path will be discontiguous and +// defined across two sub-buffers. +// +// Nodes are updated locally until full and then stored so they will +// never be incomplete. Headers are stored locally until the path is +// ended so they will never be incomplete. +// +// A line, quad or cubic acquires 4/6/8 segments which may be spread +// across one or more contiguous blocks. +// +// If a flush() occurs then the remaining columns of multi-segment +// paths are initialized with zero-length line, quad, cubic elements. +// +// Every block's command word has a type and a count acquired from a +// rolling counter. +// +// The kernel is passed two spans of blocks { base, count } to +// process. The grid must process (lo.count + hi.count) blocks.
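+//
+// Illustrative numbers only (not pulled from the config): with a
+// 16-block ring, a span wrapping from block 14 up to (but not
+// including) block 3 would be passed as lo = { 14, 2 } and
+// hi = { 0, 3 }, so the kernel processes 2 + 3 = 5 blocks.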
+// + +struct skc_subbuffer_blocks +{ + cl_mem device; + void * host; +}; + +struct skc_subbuffer_cmds +{ + cl_mem device; + void * host; + cl_event map; +}; + +// +// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer ) +// + +typedef skc_uint skc_ringdex_t; + +union skc_ringdex_expand +{ + div_t qr; + + struct { +#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0 + skc_uint subbuf; + skc_uint block; +#else + skc_uint block; + skc_uint subbuf; +#endif + }; +}; + +// +// this record is executed by the grid +// + +struct skc_release_record +{ + struct skc_path_builder_impl * impl; // back pointer to impl + + skc_grid_t grid; // pointer to scheduled grid + + skc_uint from; // inclusive starting index : [from,to) + skc_uint to; // non-inclusive ending index : [from,to) +}; + +// +// +// + +struct skc_path_builder_impl +{ + struct skc_path_builder * path_builder; + + struct skc_runtime * runtime; + + cl_command_queue cq; + + struct { + cl_kernel alloc; + cl_kernel copy; + } kernels; + + // + // FIXME -- make this pointer to constant config + // + // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv + struct { + skc_uint subbufs; // how many subbufs in the buffer? + + struct { + skc_uint buffer; // how many blocks in the buffer? + skc_uint subbuf; // how many blocks in a subbuf? + } blocks_per; + } ring; + // + // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^ + // + + struct { + cl_mem buffer; // backing buffer for blocks + struct skc_subbuffer_blocks * subbufs; // array of structures + } blocks; + + struct { + cl_mem buffer; // backing buffer for commands + struct skc_subbuffer_cmds * subbufs; // array of structures + } cmds; + + struct { + struct skc_release_record * records; // max release records is equal to max subbufs + skc_path_t * paths; // max paths is less than or equal to max commands + } release; + + cl_mem reads; // each kernel only requires one word to store the block pool "base" + + struct { + skc_uint rolling; // rolling counter used by cmds to map to block pool alloc + skc_ringdex_t from; + skc_ringdex_t to; + } prev; + + struct { + skc_ringdex_t from; + skc_ringdex_t to; + } curr; + + struct { + struct skc_path_head * head; // pointer to local path header -- not written until path end + struct skc_path_node * node; // pointer to local node -- may alias head until head is full + + struct { + skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated + union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids + skc_uint rem; // how many id slots left in node block + } ids; + + struct { + skc_uint rem; // how many subblocks left in block? 
+ skc_uint rolling; // rolling counter of block of subblocks + float * next; // next subblock in current subblock block + skc_uint idx; // index of next subblock + } subblocks; + + struct { + skc_uint one; // .block = 1 + skc_uint next; // rolling counter used by cmds to map to block pool alloc + } rolling; + + skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current + } wip; +}; + +// +// FIXME -- move to a pow2 subbuffer size and dispense with division +// and modulo operations +// + +static +union skc_ringdex_expand +skc_ringdex_expand(struct skc_path_builder_impl * const impl, + skc_ringdex_t const ringdex) +{ + return (union skc_ringdex_expand){ + .qr = div(ringdex,impl->ring.blocks_per.subbuf) + }; +} + +static +void +skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl) +{ + // + // FIXME - which is faster? + // +#if 1 + impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer; +#else + impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer) ? -1 : impl->wip.to; +#endif + + // this path is too long -- for now assert() and die + assert(impl->wip.to != impl->curr.from); +} + +static +skc_ringdex_t +skc_ringdex_span(struct skc_path_builder_impl * const impl, + skc_ringdex_t const from, + skc_ringdex_t const to) +{ + return (to - from) % impl->ring.blocks_per.buffer; +} + +static +void +skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl) +{ + union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); + + // nothing to do if this is the first block in the subbuf + if (to.block == 0) + return; + + skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs; + + // otherwise increment and mod + impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf; +} + +static +skc_bool +skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl) +{ + return impl->curr.from == impl->curr.to; +} + +static +skc_bool +skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl) +{ + return impl->prev.from == impl->prev.to; +} + +static +skc_uint +skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl, + skc_uint const to_block) +{ + // no blocks acquired OR this is last block in subbuf + return !((impl->wip.to == impl->curr.to) || (to_block == 0)); +} + +// +// +// + +static +struct skc_release_record * +skc_release_curr(struct skc_path_builder_impl * const impl) +{ + union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); + + return impl->release.records + curr_from.subbuf; +} + +// +// FIXME -- get rid of all distant config references -- grab them at all at creation time +// + +static +void +skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl) +{ + // init header counters // { handle, blocks, nodes, prims } + impl->wip.head->header = (union skc_path_header){ + .handle = 0, + .blocks = 0, + .nodes = 0, + .prims = 0 + }; + + // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS + impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN }; + + // point wip ids at local head node + impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node + impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere + + // start with no subblocks + impl->wip.subblocks.rem = 0; +} + +// +// +// + +static +void +skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl) +{ +#if 1 + // + // FIXME -- 
a Duff's device might be optimal here but would have to + // be customized per device since node's could be 16-128+ words + // + while (impl->wip.ids.rem > 0) + { + impl->wip.ids.rem -= 1; + impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID; + impl->wip.ids.next += 1; + } +#else + memset(&impl->wip.ids.next->u32, + SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF + sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem); + + impl->wip.ids.next += impl->wip.ids.rem; + impl->wip.ids.rem = 0; +#endif +} + +// +// +// + +static +void +skc_zero_float(skc_float * p, skc_uint rem) +{ + memset(p,0,sizeof(*p)*rem); +} + +static +void +skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder) +{ + // + // FIXME -- it might be more performant to zero the remaining + // columns in a subblock -- a subblock at a time -- instead of the + // same column across all the subblocks + // +#if 0 + while (path_builder->line.rem > 0) + { + --path_builder->line.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + } + + while (path_builder->quad.rem > 0) + { + --path_builder->quad.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + *path_builder->line.coords[4]++ = 0.0f; + *path_builder->line.coords[5]++ = 0.0f; + } + + while (path_builder->cubic.rem > 0) + { + --path_builder->cubic.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + *path_builder->line.coords[4]++ = 0.0f; + *path_builder->line.coords[5]++ = 0.0f; + *path_builder->line.coords[6]++ = 0.0f; + *path_builder->line.coords[7]++ = 0.0f; + } +#else + if (path_builder->line.rem > 0) + { + skc_zero_float(path_builder->line.coords[0],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[1],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[2],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[3],path_builder->line.rem); + + path_builder->line.rem = 0; + } + + if (path_builder->quad.rem > 0) + { + skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem); + + path_builder->quad.rem = 0; + } + + if (path_builder->cubic.rem > 0) + { + skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem); + + path_builder->cubic.rem = 0; + } +#endif +} + +// +// +// + +static +void +skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl, + skc_uint from, + 
skc_uint to) +{ + // to might be out of range + to = to % impl->ring.subbufs; + +#if 0 + fprintf(stderr,"unmap: [%2u,%2u)\n",from,to); +#endif + + while (from != to) // 'to' might be out of range + { + // bring 'from' back in range + from = from % impl->ring.subbufs; + + struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; + struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; + + cl(EnqueueUnmapMemObject(impl->cq, + blocks->device, + blocks->host, + 0,NULL,NULL)); + + cl(EnqueueUnmapMemObject(impl->cq, + cmds->device, + cmds->host, + 0,NULL,NULL)); + + // bring from back in range + from = ++from % impl->ring.subbufs; + } +} + +// +// FIXME -- reuse this in create() +// + +static +void +skc_path_builder_impl_map(struct skc_path_builder_impl * const impl, + skc_uint from, + skc_uint to) +{ + // to might be out of range + to = to % impl->ring.subbufs; + +#if 0 + fprintf(stderr," map: [%2u,%2u)\n",from,to); +#endif + + while (from != to) + { + cl_int cl_err; + + struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; + struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; + + blocks->host = clEnqueueMapBuffer(impl->cq, + blocks->device, + CL_FALSE, + CL_MAP_WRITE_INVALIDATE_REGION, + 0,impl->runtime->config->paths_copy.block.subbuf, + 0,NULL,NULL, + &cl_err); cl_ok(cl_err); + + cl(ReleaseEvent(cmds->map)); + + cmds->host = clEnqueueMapBuffer(impl->cq, + cmds->device, + CL_FALSE, + CL_MAP_WRITE_INVALIDATE_REGION, + 0,impl->runtime->config->paths_copy.command.subbuf, + 0,NULL,&cmds->map, + &cl_err); cl_ok(cl_err); + + // bring from back in range + from = ++from % impl->ring.subbufs; + } + // + // FIXME -- when we switch to out of order queues we'll need a barrier here + // +} + +// +// +// + +static +void +skc_path_builder_release_dispose(struct skc_release_record * const release, + struct skc_path_builder_impl * const impl) +{ + struct skc_runtime * runtime = impl->runtime; + + if (release->from <= release->to) // no wrap + { + skc_path_t const * paths = impl->release.paths + release->from; + skc_uint count = release->to - release->from; + + skc_grid_deps_unmap(runtime->deps,paths,count); + skc_runtime_path_device_release(runtime,paths,count); + } + else // from > to implies wrap + { + skc_path_t const * paths_lo = impl->release.paths + release->from; + skc_uint count_lo = impl->ring.blocks_per.buffer - release->from; + + skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo); + skc_runtime_path_device_release(runtime,paths_lo,count_lo); + + skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to); + skc_runtime_path_device_release(runtime,impl->release.paths,release->to); + } + + release->to = release->from; +} + +static +void +skc_path_builder_grid_pfn_dispose(skc_grid_t const grid) +{ + struct skc_release_record * const release = skc_grid_get_data(grid); + struct skc_path_builder_impl * const impl = release->impl; + + skc_path_builder_release_dispose(release,impl); +} + +static +void +// skc_path_builder_complete(struct skc_release_record * const release) +skc_path_builder_complete(skc_grid_t grid) +{ + // + // notify deps that this grid is complete enough for other grids to + // proceed + // + // the path builder still has some cleanup to do before all its + // resources can be reused + // + skc_grid_complete(grid); +} + +static +void +skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid) +{ + SKC_CL_CB(status); + + struct skc_release_record * const release = skc_grid_get_data(grid); + + 
SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid); +} + +// +// +// + +static +void +skc_path_builder_grid_pfn_waiting(skc_grid_t const grid) +{ + struct skc_release_record * const release = skc_grid_get_data(grid); + struct skc_path_builder_impl * const impl = release->impl; + + // 1. flush incomplete subblocks of path elements + // 2. unmap subbuffer on cq.unmap + // 3. flush cq.unmap + // 4. launch kernel on cq.kernel but wait for unmap completion + // 5. flush cq.kernel + // 6. remap relevant subbuffers on cq.map but wait for kernel completion + // 7. flush cq.map + + // + // FIXME -- can be smarter about flushing if the wip paths are not + // in the same subbuf as curr.to + // + // THIS IS IMPORTANT TO FIX + // + + // flush incomplete subblocks + skc_path_builder_finalize_subblocks(impl->path_builder); + + // + // get range of subbufs that need to be unmapped + // + // note that impl->prev subbufs have already been unmapped + // + union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); + union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to); + skc_uint const is_partial = curr_to.block > 0; + skc_uint const unmap_to = curr_to.subbuf + is_partial; + + // + // unmap all subbufs in range [from,to) + // + skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to); + + // + // launch kernels + // + skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to); + skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to); + skc_uint const pb_cmds = pb_prev_span + pb_curr_span; + + // + // 1) allocate blocks from pool + // + + // + // FIXME -- pack integers into struct/vector + // + cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads))); + cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf))); + cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds))); + + skc_device_enqueue_kernel(impl->runtime->device, + SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, + impl->cq, + impl->kernels.alloc, + 1, + 0,NULL,NULL); + + // + // 2) copy blocks from unmapped device-accessible memory + // + + // + // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7 + // + cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw))); + + cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask))); + + cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads))); + cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf))); + + cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer))); + cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer))); + + cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer))); + cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling))); + + cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from))); + cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span))); + cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from))); + + cl_event complete; + + skc_device_enqueue_kernel(impl->runtime->device, + SKC_DEVICE_KERNEL_ID_PATHS_COPY, + impl->cq, + impl->kernels.copy, + pb_cmds, + 0,NULL,&complete); + + // 
set a callback on completion + cl(SetEventCallback(complete,CL_COMPLETE, + skc_path_builder_paths_copy_cb, + grid)); + + // immediately release + cl(ReleaseEvent(complete)); + + // + // remap as many subbuffers as possible after the kernel completes + // + // note that remaps are async and enqueued on the same command queue + // as the kernel launch + // + // we can't remap subbuffers that are in the possibly empty range + // + // cases: + // + // - curr.to == wip.to which means no blocks have been acquired + // - curr.to points to first block in (next) subbuf + // - otherwise, wip acquired blocks in the curr.to subbuf + // + // check for these first 2 cases! + // + union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from); + skc_uint const no_wip = impl->curr.to == impl->wip.to; + skc_uint map_to = curr_to.subbuf + (is_partial && no_wip); + + // remap all subbufs in range [from,to) + skc_path_builder_impl_map(impl,prev_from.subbuf,map_to); + + // flush command queue + cl(Flush(impl->cq)); + + // save rolling + impl->prev.rolling = impl->wip.rolling.next; + + // update prev and curr + if (no_wip) + { + // + // if there was no wip then round up to the next subbuf + // + skc_ringdex_wip_to_subbuf_inc(impl); + + // + // update prev/curr with with incremented wip + // + impl->prev.from = impl->prev.to = impl->wip.to; + impl->curr.from = impl->curr.to = impl->wip.to; + } + else + { + // + // update prev with wip partials + // + impl->prev.from = impl->curr.to; + impl->prev.to = impl->wip .to; + + // + // start curr on a new subbuf boundary + // + skc_ringdex_wip_to_subbuf_inc(impl); + + impl->curr.from = impl->wip.to; + impl->curr.to = impl->wip.to; + } +} + +// +// +// + +static +void +skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl, + skc_uint const subbuf) +{ + // + // FIXME -- move to a power-of-two subbuf size and kickstart path + // copies as early as possible + // + // FIXME -- the subbufs "self-clock" (flow control) the kernel + // launches and accounting. Combine all the subbuffers and release + // records into a single indexable struct instead of 3. + // + struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf; + struct skc_release_record * const release = impl->release.records + subbuf; + struct skc_scheduler * const scheduler = impl->runtime->scheduler; + + // can't proceed until the paths have been released + SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to); + + // throw in a scheduler yield ... FIXME -- get rid of + skc_scheduler_yield(scheduler); + + // can't proceed until the subbuffer is mapped + cl(WaitForEvents(1,&sc->map)); +} + +// +// +// + +static +union skc_ringdex_expand +skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl) +{ + // break ringdex into components + union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); + + // does wip ringdex point to a new subbuffer? 
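// (to.block == 0 means wip.to just rolled over into the first block
// of a subbuffer, so that subbuffer may still be owned by an
// in-flight copy and has to be acquired -- drained and remapped --
// before the host writes into it)
//
// [editorial sketch -- not part of this change] if blocks_per.subbuf
// were a power of two (see the FIXME above skc_ringdex_expand) the
// div/mod in expand() and this boundary test would reduce to a shift
// and a mask; "subbuf_log2" below is a hypothetical precomputed
// constant:
#if 0
  skc_uint const subbuf = impl->wip.to >> impl->ring.blocks_per.subbuf_log2;  // wip.to / blocks_per.subbuf
  skc_uint const block  = impl->wip.to &  (impl->ring.blocks_per.subbuf - 1); // wip.to % blocks_per.subbuf
#endif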
+ if (to.block == 0) + { + // potentially spin/block waiting for subbuffer + skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf); + } + + // post increment wip.to + skc_ringdex_wip_to_block_inc(impl); + + return to; +} + +// +// +// + +static +skc_uint +skc_rolling_block(skc_uint const rolling, skc_uint const tag) +{ + return rolling | tag; +} + +static +skc_uint +skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag) +{ + return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag; +} + +static +void +skc_rolling_inc(struct skc_path_builder_impl * const impl) +{ + impl->wip.rolling.next += impl->wip.rolling.one; +} + +// +// +// + +static +void * +skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl, + skc_uint const rolling, + skc_cmd_paths_copy_tag const tag) +{ + // bump blocks count + impl->wip.head->header.blocks += 1; + + // acquire a block + union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl); + + // make a pointer + union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host; + + // store command for block + cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag); + +#if 0 + // store command for block + cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag); + + // increment rolling + skc_rolling_inc(impl); +#endif + + // return pointer to block + float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host; + + // FIXME -- make it easier to get config constant + return blocks_subbuf + (to.block * impl->runtime->config->block.words); +} + +// +// +// + +static +void +skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl) +{ + // store command to subbuf and get pointer to blocks subbuf + void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling, + SKC_CMD_PATHS_COPY_TAG_NODE); + + // copy head to blocks subbuf -- write-only + memcpy(block,impl->wip.node,impl->runtime->config->block.bytes); +} + +static +void +skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl) +{ + // store command to subbuf and get pointer to blocks subbuf + void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, + SKC_CMD_PATHS_COPY_TAG_HEAD); + + // copy head to blocks subbuf -- write-only + memcpy(block,impl->wip.head,impl->runtime->config->block.bytes); + + // increment rolling + skc_rolling_inc(impl); + + // the 'to' index is non-inclusive so assign wip.to after flush_head + impl->curr.to = impl->wip.to; +} + +// +// +// + +static +void +skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl) +{ + // update final block id in node + impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT); + + // if wip.ids is not the header then flush now full wip node + if (impl->wip.head->header.nodes > 0) + skc_path_builder_impl_flush_node(impl); + + // bump node count + impl->wip.head->header.nodes += 1; + + // save current rolling + impl->wip.ids.rolling = impl->wip.rolling.next; + + // increment rolling + skc_rolling_inc(impl); + + // update wip.ids.* + impl->wip.ids.next = impl->wip.node->tag_ids; + impl->wip.ids.rem = impl->runtime->config->block.words; +} + +static +void +skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl) +{ + impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure + impl->wip.subblocks.rolling = 
impl->wip.rolling.next; + impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, + SKC_CMD_PATHS_COPY_TAG_SEGS); + impl->wip.subblocks.idx = 0; + + // increment rolling + skc_rolling_inc(impl); +} + +// +// +// + +static +void +skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl, + skc_block_id_tag tag, + skc_uint vertices, + float * * subblocks) +{ + // + // FIRST TAG RECORDS THE ELEMENT TYPE + // + while (true) + { + // if only one block id left in node then acquire new node block + // and append its block id as with a next tag + if (impl->wip.ids.rem == 1) + skc_path_builder_impl_new_node_block(impl); + + // if zero subblocks left then acquire a new subblock block and + // append its block id + if (impl->wip.subblocks.rem == 0) + skc_path_builder_impl_new_segs_block(impl); + + // save first command -- tag and subblocks may have been updated + impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag); + + // increment node block subblock pointer + impl->wip.ids.next += 1; + impl->wip.ids.rem -= 1; + + // how many vertices can we store + skc_uint rem = min(vertices,impl->wip.subblocks.rem); + + // decrement vertices + vertices -= rem; + impl->wip.subblocks.rem -= rem; + impl->wip.subblocks.idx += rem; + + // assign subblocks + do { + *subblocks++ = impl->wip.subblocks.next; + impl->wip.subblocks.next += impl->runtime->config->subblock.words; + // FIXME -- move constants closer to structure + } while (--rem > 0); + + // anything left to do? + if (vertices == 0) + break; + + // any tag after this will be a caboose command + tag = SKC_BLOCK_ID_TAG_PATH_NEXT; + } +} + +// +// +// + +static +void +skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path) +{ + // finalize incomplete active subblocks -- we don't care about any + // remaining unused subblocks in block + skc_path_builder_finalize_subblocks(impl->path_builder); + + // mark remaining wips.ids in the head or node as invalid + skc_path_builder_impl_finalize_node(impl); + + // flush node if rem > 0 and node is not actually head + if (impl->wip.head->header.nodes >= 1) + skc_path_builder_impl_flush_node(impl); + + // acquire path host id + *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN + + // save path host handle + impl->wip.head->header.handle = *path; + + // flush head -- acquires a block and bumps head->header.blocks + skc_path_builder_impl_flush_head(impl); + + // get current release + struct skc_release_record * const release = skc_release_curr(impl); + + // acquire grid if null + if (release->grid == NULL) + { + release->grid = + SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + &release->grid, // NULL on start/force + release, // data payload + skc_path_builder_grid_pfn_waiting, + NULL, // no execute pfn + skc_path_builder_grid_pfn_dispose); + } + + // update grid map + skc_grid_map(release->grid,*path); + + // update path release + impl->release.paths[release->to] = *path; + + // increment release.to + release->to = (release->to + 1) % impl->ring.blocks_per.buffer; + + // add guard bit + *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH; + +#if 1 + // + // eager kernel launch? 
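// (i.e. once curr.to has crossed into a different subbuffer than
// curr.from, the subbuffer holding the front of the span can no
// longer grow, so starting the grid here gets its copy kernels
// going without waiting for an explicit flush from the caller)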
+ // + { + union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from); + union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to); + + if (curr_from.subbuf != curr_to.subbuf) + { + skc_grid_start(release->grid); + // skc_scheduler_yield(impl->runtime->scheduler); + } + } +#endif +} + +// +// FIXME -- clean up accessing of CONFIG constants in these 3 routines +// + +static +void +skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4, + impl->path_builder->line.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->line.rem = impl->runtime->config->subblock.words; +} + +static +void +skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6, + impl->path_builder->quad.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->quad.rem = impl->runtime->config->subblock.words; +} + +static +void +skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8, + impl->path_builder->cubic.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->cubic.rem = impl->runtime->config->subblock.words; +} + +// +// +// + +static +void +skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl) +{ + // decrement reference count + if (--impl->path_builder->refcount != 0) + return; + + // + // otherwise, dispose of everything + // + struct skc_runtime * const runtime = impl->runtime; + + // free path builder + skc_runtime_host_perm_free(impl->runtime,impl->path_builder); + + // release cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // release kernels + cl(ReleaseKernel(impl->kernels.alloc)); + cl(ReleaseKernel(impl->kernels.copy)); + + // free blocks extents + cl(ReleaseMemObject(impl->blocks.buffer)); + skc_runtime_host_perm_free(runtime,impl->blocks.subbufs); + + cl(ReleaseMemObject(impl->cmds.buffer)); + skc_runtime_host_perm_free(runtime,impl->cmds.subbufs); + + // free records + skc_runtime_host_perm_free(runtime,impl->release.records); + skc_runtime_host_perm_free(runtime,impl->release.paths); + + // release staging head and node + skc_runtime_host_perm_free(runtime,impl->wip.head); + skc_runtime_host_perm_free(runtime,impl->wip.node); + + // release reads scratch array + cl(ReleaseMemObject(impl->reads)); + + // for all subbuffers + // unmap subbuffer + // release subbuffer + // printf("%s not releasing subbuffers\n",__func__); + + skc_runtime_host_perm_free(impl->runtime,impl); +} + +// +// +// + +skc_err +skc_path_builder_cl_12_create(struct skc_context * const context, + struct skc_path_builder * * const path_builder) +{ + // + // retain the context + // skc_context_retain(context); + // + struct skc_runtime * const runtime = context->runtime; + + // allocate path builder + (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder)); + + // init state + SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY); + + (*path_builder)->context = context; + + // save opaque impl-specific 
pointers
+ (*path_builder)->begin = skc_path_builder_pfn_begin;
+ (*path_builder)->end = skc_path_builder_pfn_end;
+ (*path_builder)->new_line = skc_path_builder_pfn_new_line;
+ (*path_builder)->new_quad = skc_path_builder_pfn_new_quad;
+ (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic;
+ (*path_builder)->release = skc_path_builder_pfn_release;
+
+ // initialize path builder counts
+ (*path_builder)->line.rem = 0;
+ (*path_builder)->quad.rem = 0;
+ (*path_builder)->cubic.rem = 0;
+
+ (*path_builder)->refcount = 1;
+
+ struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));
+
+ (*path_builder)->impl = impl;
+
+ //
+ // init impl
+ //
+ impl->path_builder = *path_builder;
+ impl->runtime = runtime;
+
+ impl->cq = skc_runtime_acquire_cq_in_order(runtime);
+
+ impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
+ impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);
+
+ //
+ // FIXME -- let these config constants remain constant and in place
+ //
+ struct skc_config const * const config = runtime->config;
+
+ impl->ring.subbufs = config->paths_copy.buffer.count;
+ impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
+ impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
+ //
+ // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ //
+
+ cl_int cl_err;
+
+ // allocate large device-side extent for path data
+ impl->blocks.buffer = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // allocate small host-side array of pointers to mapped subbufs
+ impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->blocks.subbufs));
+
+ // allocate large device-side extent for path copy commands
+ impl->cmds.buffer = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ config->paths_copy.command.buffer,
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // allocate small host-side array of pointers to mapped subbufs
+ impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->cmds.subbufs));
+
+ // allocate small host-side array of intervals of path handles
+ impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->release.records));
+
+ // allocate large host-side array that is max # of path handles in flight
+ impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.blocks_per.buffer *
+ sizeof(*impl->release.paths));
+
+ // small scratch used by kernels
+ impl->reads = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
+ sizeof(skc_uint) * impl->ring.subbufs,
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // initialize release record with impl backpointer
+ for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
+ {
+ struct skc_release_record * record = impl->release.records + ii;
+
+ record->impl = impl;
+ record->grid = NULL;
+ record->from = record->to = ii * impl->ring.blocks_per.subbuf;
+ }
+
+ //
+ // allocate and map subbuffers -- we always check the command
+ // subbuffer's map/unmap events before touching it or its associated
+ // block subbuffer.
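// (skc_path_builder_impl_acquire_subbuffer() above is where that
// check happens: it waits for the subbuffer's release record to
// drain and then for the command subbuffer's map event)
//
// [editorial sketch -- not part of this change] the same routine
// carries a FIXME asking for the three parallel per-subbuffer
// arrays allocated above to be folded into a single indexable
// record -- roughly along these lines (hypothetical layout):
#if 0
struct skc_subbuffer
{
  struct skc_subbuffer_blocks blocks;  // device sub-buffer + mapped host pointer for path data
  struct skc_subbuffer_cmds   cmds;    // device sub-buffer + mapped host pointer + map event for copy cmds
  struct skc_release_record   release; // grid + [from,to) interval of path handles to release
};
#endif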
+ //
+ struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
+ struct skc_subbuffer_cmds * sc = impl->cmds .subbufs;
+
+ cl_buffer_region rb = { 0, config->paths_copy.block.subbuf };
+ cl_buffer_region rc = { 0, config->paths_copy.command.subbuf };
+
+ // for each subbuffer
+ for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
+ {
+ sb->device = clCreateSubBuffer(impl->blocks.buffer,
+ CL_MEM_HOST_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &rb,
+ &cl_err); cl_ok(cl_err);
+
+ sb->host = clEnqueueMapBuffer(impl->cq,
+ sb->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,rb.size,
+ 0,NULL,NULL,
+ &cl_err); cl_ok(cl_err);
+
+ sc->device = clCreateSubBuffer(impl->cmds.buffer,
+ CL_MEM_HOST_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &rc,
+ &cl_err); cl_ok(cl_err);
+
+ sc->host = clEnqueueMapBuffer(impl->cq,
+ sc->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,rc.size,
+ 0,NULL,&sc->map,
+ &cl_err); cl_ok(cl_err);
+ sb += 1;
+ sc += 1;
+
+ rb.origin += rb.size;
+ rc.origin += rc.size;
+ }
+
+ //
+ // initialize remaining members
+ //
+ impl->prev.from = 0;
+ impl->prev.to = 0;
+ impl->prev.rolling = 0;
+
+ impl->curr.from = 0;
+ impl->curr.to = 0;
+
+ impl->wip.to = 0;
+
+ impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
+ impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
+
+ impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
+ impl->wip.rolling.next = 0;
+
+ // for now, completely initialize builder before returning
+ cl(Finish(impl->cq));
+
+ return SKC_ERR_SUCCESS;
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/path_builder_cl_12.h b/src/compute/skc/platforms/cl_12/path_builder_cl_12.h
new file mode 100644
index 0000000000..20bb13cbdf
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#ifndef PATH_BUILDER_CL_12_ONCE
+#define PATH_BUILDER_CL_12_ONCE
+
+//
+//
+//
+
+#include "block.h"
+
+//
+// A tag type that fits into the block id tag bitfield
+//
+
+typedef enum skc_cmd_paths_copy_tag {
+
+ SKC_CMD_PATHS_COPY_TAG_SEGS,
+ SKC_CMD_PATHS_COPY_TAG_NODE,
+ SKC_CMD_PATHS_COPY_TAG_HEAD,
+
+ SKC_CMD_PATHS_COPY_TAG_COUNT
+
+} skc_cmd_paths_copy_tag;
+
+
+SKC_STATIC_ASSERT(SKC_CMD_PATHS_COPY_TAG_COUNT <= SKC_BLOCK_ID_TAG_COUNT);
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
+
diff --git a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
new file mode 100644
index 0000000000..33992cbdfb
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
@@ -0,0 +1,1349 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+// get rid of these
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "hs/cl/hs_cl_launcher.h"
+
+#include "common/cl/assert_cl.h"
+
+#include "context.h"
+#include "grid.h"
+#include "raster.h"
+#include "extent_ring.h"
+#include "raster_builder.h"
+
+#include "tile.h"
+
+#include "config_cl.h"
+#include "runtime_cl_12.h"
+#include "extent_cl_12.h"
+#include "raster_builder_cl_12.h"
+
+//
+// RASTERIZATION SUB-PIPELINE
+// --------------------------
+//
+// Phase 1: expand commands
+//
+// Phase 2: rasterize
+//
+// Phase 3: sort & segment || release paths
+//
+// Phase 4: prefix
+//
+// Phase 5: release rasters
+//
+// RASTER COHORT
+// ==============
+//
+// BUILDER RASTERIZER POST PROCESSING
+// <-----------------------------------------------> <------------> <--------------------------------------------------------------------->
+//
+// fill cmds transforms raster clips path release rasterize cmds cohort map raster release TTSB TTSK cohort atomics context atomics
+// --------- ---------- ------------ ------------ -------------- ---------- -------------- ---- ---- -------------- ---------------
+// 1,2 1,2 1,2 1,2 2 1-4 1,2,3,4 2-4 2-4 2-4 global
+//
+//
+// NOTES: FINE-GRAINED SVM
+// -----------------------
+//
+// 1) In a fine-grained system we know the exact number of
+// rasterize cmds per segment type before phase 1
+//
+// 2) A raster that's "under construction" shouldn't be rasterized
+// until it is complete. This implies that a raster is not part
+// of a cohort until it is complete. The raster builder must
+// handle raster promises being "forced" to completion -- this is
+// likely the result of composition construction and subsequent
+// rendering to a surface.
+//
+// 3) The raster cohort rasterizer state retains the fill cmd,
+// transform, raster clip and path release "ring" extents.
+//
+// 4) The rasterize cmd extent sizes (line, quad, cubic, rational
+// quad, rational cubic) are known ahead of time.
+//
+// 5) The raster cohort post processor is standalone and retains the
+// raster_map, cohort atomics, TTSK_RYX extent, and raster
+// references until complete.
+//
+
+//
+// Notes:
+//
+// - Could have a pipeline stage before expansion count the exact
+// number of line/quad/cubic commands but the command buffers are
+// relatively small (64-bit commands * # of path segments).
+//
+
+// raster
+// cohort atomics path_ids raster_ids transforms clips cmds_fill cmds_l/q/c ttsk_ryx
+//
+//
+// BEGIN ^
+// |
+// EXPAND |
+// |
+// RASTERIZE |
+// |
+// SORT || RELEASE PATHS |
+// |
+// PREFIX |
+// |
+// RELEASE RASTERS |
+// |
+// END v
+//
+//
+// BEGIN
+//
+// EXPAND -- PRODUCES: one or more extents of rasterization commands
+//
+// RASTERIZE -- DEPENDENCY: requires size of command extents before launching
+// -- PRODUCES: an extent of ttsk_ryx keys
+//
+// SORT || RELEASE PATHS -- DEPENDENCY: requires size of key extent before launching
+// -- PRODUCES: sorted array of keys
+//
+// PREFIX -- DEPENDENCY: none -- can execute after SORT because grid size is number of rasters
+//
+// RELEASE RASTERS -- DEPENDENCY: none -- can execute after prefix
+//
+// END
+//
+
+// ------------------------
+//
+// DEPENDENCY is cleanly implemented with a host callback or device kernel launcher
+//
+// Can this hide resource acquisition? Yes. But there are two cases:
+//
+// 1. acquisition of resources occurs on the host thread and lack of
+// resources drains the host command queue until resources are
+// available (OpenCL 2.x)
+//
+// 2.
the host commands lazily acquire resources (OpenCL 1.2) +// +// ------------------------ +// +// How to express? +// +// Each substage launches its successors. This supports both dependency models. +// +// If OpenCL 1.2 then the substage can't be launched until the prior +// stage's event is complete. So this requires registering a callback +// to invoke the substage. +// +// ------------------------ + +// +// BUILD +// + +struct skc_raster_builder_impl +{ + struct skc_raster_builder * raster_builder; + struct skc_runtime * runtime; + + skc_grid_t cohort; + + // these are all durable/perm extents + struct skc_extent_phrwg_thr1s path_ids; // read/write by host + struct skc_extent_phw1g_tdrNs transforms; // write once by host + read by device + struct skc_extent_phw1g_tdrNs clips; // write once by host + read by device + struct skc_extent_phw1g_tdrNs fill_cmds; // write once by host + read by device + struct skc_extent_phrwg_tdrNs raster_ids; // read/write by host + read by device + + struct { + cl_kernel fills_expand; + cl_kernel rasterize_all; + cl_kernel segment; + cl_kernel rasters_alloc; + cl_kernel prefix; + } kernels; +}; + +// +// RASTER COHORT +// +// This sub-pipeline snapshots the raster builder and then acquires +// and releases host and device resources as necessary (as late as +// possible). +// +// Note that the cohort extents are ephemeral and are only used by one +// or more stages of a the rasterization sub-pipeline. +// +// The pipeline implementation may vary between compute platforms. +// + +struct skc_raster_cohort +{ + struct skc_raster_builder_impl * impl; + + struct skc_extent_phrwg_thr1s_snap path_ids; // read/write by host + struct skc_extent_phw1g_tdrNs_snap transforms; // write once by host + read by device + struct skc_extent_phw1g_tdrNs_snap clips; // write once by host + read by device + struct skc_extent_phw1g_tdrNs_snap fill_cmds; // write once by host + read by device + struct skc_extent_phrwg_tdrNs_snap raster_ids; // read/write by host + read by device + + cl_command_queue cq; + + // sub-pipeline atomics + struct skc_extent_thr_tdrw atomics; + + // path primitives are expanded into line/quad/cubic/rational cmds + struct skc_extent_tdrw cmds; + + // rasterization output + struct skc_extent_tdrw keys; + // struct skc_extent_thrw_tdrw keys; + + // post-sort extent with metadata for each raster + struct skc_extent_tdrw metas; + // struct skc_extent_thrw_tdrw metas; + + // subbuf id + skc_subbuf_id_t id; + + // + // pipeline also uses the following global resources: + // + // - command queue from global factory + // - global block pool and its atomics + // - global path and raster host id map + // - temporary host and device allocations + // +}; + +// +// TTRK (64-BIT COMPARE) +// +// 0 63 +// | TTSB ID | X | Y | COHORT ID | +// +---------+------+------+-----------+ +// | 27 | 12 | 12 | 13 | +// +// +// TTRK (32-BIT COMPARE) +// +// 0 63 +// | TTSB ID | N/A | X | Y | COHORT ID | +// +---------+-----+------+------+-----------+ +// | 27 | 5 | 12 | 12 | 8 | +// + +// +// TTRK is sortable intermediate key format for TTSK +// +// We're going to use the 32-bit comparison version for now +// + +union skc_ttrk +{ + skc_ulong u64; + skc_uint2 u32v2; + + struct { + skc_uint block : SKC_TTXK_LO_BITS_ID; + skc_uint na0 : SKC_TTRK_LO_BITS_NA; + skc_uint x : SKC_TTXK_HI_BITS_X; + skc_uint y : SKC_TTXK_HI_BITS_Y; + skc_uint cohort : SKC_TTRK_HI_BITS_COHORT; + }; + + struct { + skc_uint na1; + skc_uint yx : SKC_TTXK_HI_BITS_YX; + skc_uint na2 : SKC_TTRK_HI_BITS_COHORT; + }; + 
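// (the third view below fuses y and cohort into a single "cohort_y"
// field -- presumably so the TTRK segmenting kernel can detect a new
// cohort or a new tile row with one compare)
//
// [editorial sketch -- not part of this change] packing a TTRK key
// from its fields under the 32-bit-compare layout documented above
// (27/5/12/12/8 bits); the helper name is hypothetical:
#if 0
static union skc_ttrk
skc_ttrk_make(skc_uint const block, skc_uint const x, skc_uint const y, skc_uint const cohort)
{
  union skc_ttrk k;

  k.u32v2.lo = block;                          // TTSB block id in the low 27 bits, 5 bits unused
  k.u32v2.hi = x | (y << 12) | (cohort << 24); // x:12 | y:12 | cohort:8

  return k;
}
#endif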
+ struct { + skc_uint na3; + skc_uint na4 : SKC_TTXK_HI_BITS_X; + skc_uint cohort_y : SKC_TTRK_HI_BITS_COHORT_Y; + }; +}; + +// +// +// + +static +void +skc_raster_builder_pfn_release(struct skc_raster_builder_impl * const impl) +{ + // decrement reference count + if (--impl->raster_builder->refcount != 0) + return; + + // + // otherwise, dispose of the the raster builder and its impl + // + struct skc_runtime * const runtime = impl->runtime; + + // free the raster builder + skc_runtime_host_perm_free(runtime,impl->raster_builder); + + // free durable/perm extents + skc_extent_phrwg_thr1s_free(runtime,&impl->path_ids); + skc_extent_phw1g_tdrNs_free(runtime,&impl->transforms); + skc_extent_phw1g_tdrNs_free(runtime,&impl->clips); + skc_extent_phw1g_tdrNs_free(runtime,&impl->fill_cmds); + skc_extent_phrwg_tdrNs_free(runtime,&impl->raster_ids); + + // release kernels + cl(ReleaseKernel(impl->kernels.fills_expand)); + cl(ReleaseKernel(impl->kernels.rasterize_all)); + +#if 0 + cl(ReleaseKernel(impl->kernels.rasterize_lines)); + cl(ReleaseKernel(impl->kernels.rasterize_quads)); + cl(ReleaseKernel(impl->kernels.rasterize_cubics)); +#endif + + cl(ReleaseKernel(impl->kernels.segment)); + cl(ReleaseKernel(impl->kernels.rasters_alloc)); + cl(ReleaseKernel(impl->kernels.prefix)); + + // free the impl + skc_runtime_host_perm_free(runtime,impl); +} + +// +// +// + +static +void +skc_raster_builder_rasters_release(struct skc_runtime * const runtime, + skc_raster_t const * const rasters, + skc_uint const size, + skc_uint const from, + skc_uint const to) +{ + if (from <= to) // no wrap + { + skc_raster_t const * rasters_from = rasters + from; + skc_uint count_from = to - from; + + skc_grid_deps_unmap(runtime->deps,rasters_from,count_from); + skc_runtime_raster_device_release(runtime,rasters_from,count_from); + } + else // from > to implies wrap + { + skc_raster_t const * rasters_lo = rasters + from; + skc_uint count_lo = size - from; + + skc_grid_deps_unmap(runtime->deps,rasters_lo,count_lo); + skc_runtime_raster_device_release(runtime,rasters_lo,count_lo); + + skc_grid_deps_unmap(runtime->deps,rasters,to); + skc_runtime_raster_device_release(runtime,rasters,to); + } +} + +static +void +skc_raster_builder_paths_release(struct skc_runtime * const runtime, + struct skc_extent_phrwg_thr1s_snap * const snap) +{ + // release lo + skc_runtime_path_device_release(runtime,snap->hr1.lo,snap->count.lo); + + // release hi + if (snap->count.hi) + skc_runtime_path_device_release(runtime,snap->hr1.hi,snap->count.hi); +} + +static +void +skc_raster_builder_cohort_grid_pfn_dispose(skc_grid_t const grid) +{ + // + // ALLOCATED RESOURCES + // + // path_ids - + // raster_ids a + // transforms - + // clips - + // fill_cmds - + // cq a + // cohort atomics a + // cmds - + // keys a + // meta a + // + + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + struct skc_raster_builder_impl * const impl = cohort->impl; + struct skc_runtime * const runtime = impl->runtime; + + // + // release paths -- FIXME -- Note that releasing paths can be + // performed after rasterization is complete + // + + // snap alloc the paths -- this host snap simply sets up pointers + skc_extent_phrwg_thr1s_snap_alloc(runtime,&impl->path_ids,&cohort->path_ids); + + // unmap and release raster ids + skc_raster_builder_paths_release(runtime,&cohort->path_ids); + + // release path ids + skc_extent_phrwg_thr1s_snap_free(runtime,&cohort->path_ids); + + // + // release rasters + // + skc_uint const size = cohort->raster_ids.snap->ring->size.pow2; 
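// [editorial sketch -- not part of this change] the raster_ids snap
// here is a [from,to) window onto a power-of-two ring, which is why
// skc_raster_builder_rasters_release() above releases it as either
// one contiguous run or two runs when the window wraps:
#if 0
  //   no wrap:          [from ........ to)
  //   wrap (from > to): [0 ... to)            [from ... size)
  if (from <= to) {
    release_rasters(rasters + from, to - from);   // single contiguous run
  } else {
    release_rasters(rasters + from, size - from); // tail of the ring
    release_rasters(rasters, to);                 // wrapped head of the ring
  }
  // where release_rasters() stands in for the grid_deps unmap +
  // device release pair performed above
#endif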
+ skc_uint const from = skc_extent_ring_snap_from(cohort->raster_ids.snap); + skc_uint const to = skc_extent_ring_snap_to(cohort->raster_ids.snap); + + // unmap and release raster ids + skc_raster_builder_rasters_release(runtime,impl->raster_ids.hrw,size,from,to); + + // release cohort's remaining allocated resources + skc_extent_phrwg_tdrNs_snap_free(runtime,&cohort->raster_ids); + skc_runtime_release_cq_in_order(runtime,cohort->cq); + skc_extent_thr_tdrw_free(runtime,&cohort->atomics); + skc_extent_tdrw_free(runtime,&cohort->keys); + skc_extent_tdrw_free(runtime,&cohort->metas); + // skc_extent_thrw_tdrw_free(runtime,&cohort->keys); + // skc_extent_thrw_tdrw_free(runtime,&cohort->metas); + skc_runtime_host_temp_free(runtime,cohort,cohort->id); + + // release the raster builder + skc_raster_builder_pfn_release(impl); + + // + // ALLOCATED RESOURCES + // + // path_ids - + // raster_ids - + // transforms - + // clips - + // fill_cmds - + // cq - + // cohort atomics - + // cmds - + // keys - + // meta - + // +} + +// +// +// + +static +void +skc_raster_cohort_prefix_release(skc_grid_t const grid) +{ + // FIXME -- note that pfn_dispose can be accomplished here + + // release the grid + skc_grid_complete(grid); +} + +static +void +skc_raster_cohort_prefix_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + struct skc_scheduler * const scheduler = cohort->impl->runtime->scheduler; + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(scheduler,skc_raster_cohort_prefix_release,grid); +} + +// +// +// + +#if 0 +static +int cmp64(const void * ptr_a, const void * ptr_b) +{ + skc_ulong const a = *(const skc_ulong *)ptr_a; + skc_ulong const b = *(const skc_ulong *)ptr_b; + + if (a < b) return -1; + if (a > b) return +1; + else return 0; +} +#endif + +// +// +// + +static +void +skc_raster_cohort_sort_prefix(skc_grid_t const grid) +{ + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms a + // clips a + // fill_cmds - + // cq a + // cohort atomics a + // cmds a + // keys a + // meta - + // + + // use the backpointers + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + struct skc_raster_builder_impl * const impl = cohort->impl; + struct skc_runtime * const runtime = impl->runtime; + + // release transforms + skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->transforms); + + // release clips + skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->clips); + + // release expanded cmds + skc_extent_tdrw_free(runtime,&cohort->cmds); + + // alloc the snapshost -- could be zero-sized + skc_extent_phrwg_tdrNs_snap_alloc(runtime, + &impl->raster_ids, + &cohort->raster_ids, + cohort->cq,NULL); + + // will never be zero + skc_uint const rasters = skc_extent_ring_snap_count(cohort->raster_ids.snap); + + // acquire fixed-size device-side extent + skc_extent_tdrw_alloc(runtime, + &cohort->metas, + sizeof(struct skc_raster_cohort_meta)); + + // skc_extent_thrw_tdrw_alloc(runtime, + // &cohort->metas, + // sizeof(struct skc_raster_cohort_meta)); + + // zero the metas + skc_extent_tdrw_zero(&cohort->metas,cohort->cq,NULL); + + // get the read-only host copy of the device atomics + struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr; + + // + // SORT + // + if (atomics->keys > 0) + { +#ifndef NDEBUG + fprintf(stderr,"raster cohort sort: %u\n",atomics->keys); +#endif + + // + // + // + 
uint32_t keys_padded_in, keys_padded_out; + + hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); + + hs_sort(cohort->cq, + cohort->keys.drw, + cohort->keys.drw, + atomics->keys, + keys_padded_in, + keys_padded_out, + false); + + cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(cohort->keys.drw))); + cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(cohort->metas.drw))); + +#ifndef NDEBUG + fprintf(stderr,"post-sort\n"); +#endif + + // find start of each tile + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK, + cohort->cq, + impl->kernels.segment, + atomics->keys, + 0,NULL,NULL); + +#ifndef NDEBUG + fprintf(stderr,"post-segment\n"); +#endif + + // + // DELETE ALL THIS WHEN READY + // + +#if 0 + // + // + // + cl(Finish(cohort->cq)); + + // map keys to host + union skc_ttrk * const keys = skc_extent_thrw_tdrw_map(&cohort->keys, + cohort->cq, + NULL); + // map meta to host + struct skc_raster_cohort_meta * const metas = skc_extent_thrw_tdrw_map(&cohort->metas, + cohort->cq, + NULL); + // block until done + cl(Finish(cohort->cq)); + + // sort keys + qsort(keys,atomics->keys,sizeof(*keys),cmp64); + + // mask to determine if rk id is a new block + skc_uint const subblock_mask = runtime->config->block.subblocks - 1; + + // + // some counters + // + union skc_raster_cohort_meta_in meta_in = { + .blocks = 0, + .offset = 0, + .pk = 0, + .rk = 0 + }; + + // get first key + union skc_ttrk curr = keys[0]; + + skc_uint ii=0, jj=0; + + // for all TTRK keys + while (true) + { + // increment ttrk count + meta_in.rk += 1; + + // was this a new block? + if ((curr.u32v2.lo & subblock_mask) == 0) + meta_in.blocks += 1; + + // break if we're out of keys + if (++ii >= atomics->keys) + break; + + // otherwise, process next key + union skc_ttrk const next = keys[ii]; + + // if new cohort then save curr meta and init next meta + if (next.cohort != curr.cohort) + { + fprintf(stderr,"[ %u, %u, %u, %u ]\n", + meta_in.blocks, + meta_in.offset, + meta_in.pk, + meta_in.rk); + + // store back to buffer + metas->inout[curr.cohort].in = meta_in; + + // update meta_in + meta_in.blocks = 0; + meta_in.offset = ii; + meta_in.pk = 0; + meta_in.rk = 0; + } + // otherwise, if same y but new x then increment TTPK count + else if ((next.y == curr.y) && (next.x != curr.x)) + { + meta_in.pk += 1; + +#if 0 + fprintf(stderr,"%3u : %3u : ( %3u, %3u ) -> ( %3u )\n", + jj++,curr.cohort,curr.y,curr.x,next.x); +#endif + } + +#if 0 + fprintf(stderr,"( %3u, %3u )\n",next.y,next.x); +#endif + + curr = next; + } + + fprintf(stderr,"[ %u, %u, %u, %u ]\n", + meta_in.blocks, + meta_in.offset, + meta_in.pk, + meta_in.rk); + + // store back to buffer + metas->inout[curr.cohort].in = meta_in; + + + // unmap + skc_extent_thrw_tdrw_unmap(&cohort->keys, + keys, + cohort->cq, + NULL); + + // unmap + skc_extent_thrw_tdrw_unmap(&cohort->metas, + metas, + cohort->cq, + NULL); +#endif + } + +#ifndef NDEBUG + fprintf(stderr,"rasters_alloc: %u\n",rasters); +#endif + + // + // RASTER ALLOC/INIT + // + cl(SetKernelArg(impl->kernels.rasters_alloc,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.rasters_alloc,1,SKC_CL_ARG(runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.rasters_alloc,2,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); + cl(SetKernelArg(impl->kernels.rasters_alloc,3,SKC_CL_ARG(runtime->handle_pool.map.drw))); + cl(SetKernelArg(impl->kernels.rasters_alloc,4,SKC_CL_ARG(cohort->metas.drw))); + 
cl(SetKernelArg(impl->kernels.rasters_alloc,5,SKC_CL_ARG(cohort->raster_ids.drN))); + cl(SetKernelArg(impl->kernels.rasters_alloc,6,SKC_CL_ARG(rasters))); + + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC, + cohort->cq, + impl->kernels.rasters_alloc, + rasters, + 0,NULL,NULL); + +#ifndef NDEBUG + fprintf(stderr,"post-alloc\n"); +#endif + + // + // PREFIX + // + cl(SetKernelArg(impl->kernels.prefix,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.prefix,1,SKC_CL_ARG(runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.prefix,2,SKC_CL_ARG(runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.prefix,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); + + cl(SetKernelArg(impl->kernels.prefix,4,SKC_CL_ARG(cohort->keys.drw))); + cl(SetKernelArg(impl->kernels.prefix,5,SKC_CL_ARG(runtime->handle_pool.map.drw))); + + cl(SetKernelArg(impl->kernels.prefix,6,SKC_CL_ARG(cohort->metas.drw))); + cl(SetKernelArg(impl->kernels.prefix,7,SKC_CL_ARG(rasters))); + + cl_event complete; + + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_PREFIX, + cohort->cq, + impl->kernels.prefix, + rasters, + 0,NULL, + &complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_prefix_cb,grid)); + cl(ReleaseEvent(complete)); + +#ifndef NDEBUG + fprintf(stderr,"post-prefix\n"); +#endif + + // flush command queue + cl(Flush(cohort->cq)); + + // + // ALLOCATED RESOURCES + // + // path_ids a + // raster_ids a + // transforms - + // clips - + // fill_cmds - + // cq a + // cohort atomics a + // cmds - + // keys a + // meta a + // +} + +static +void +skc_raster_cohort_rasterize_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_sort_prefix,grid); +} + +static +void +skc_raster_cohort_rasterize(skc_grid_t const grid) +{ + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms i + // clips i + // fill_cmds s + // cq a + // cohort atomics a + // cmds a + // cmds_quad a + // cmds_cubic a + // keys - + // meta - + + // use the backpointers + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + struct skc_raster_builder_impl * const impl = cohort->impl; + struct skc_runtime * const runtime = impl->runtime; + + // + // RELEASED RESOURCES + // + // cmds snap + // + + // release the cmds extent and snap since it's only used by the expand stage + skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->fill_cmds); + + // + // NEW ALLOCATED RESOURCES + // + // transforms snap + // clips snap + // ttrk keys + // + skc_extent_phw1g_tdrNs_snap_alloc(runtime, + &impl->transforms, + &cohort->transforms, + cohort->cq,NULL); + + skc_extent_phw1g_tdrNs_snap_alloc(runtime, + &impl->clips, + &cohort->clips, + cohort->cq,NULL); + + // acquire device-side extent + skc_extent_tdrw_alloc(runtime, + &cohort->keys, + sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys); + + // skc_extent_thrw_tdrw_alloc(runtime, + // &cohort->keys, + // sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys); + + // + // acquire out-of-order command queue + // + // and launch up to 3 kernels + // + // for each kernel: + // + // set runtime "global" kernel args: + // + // - block pool atomics + // - block pool extent + // 
+ // set cohort "local" kernel args: + // + // - atomics + // - cmds + // + // enqueue barrier + // enqueue copy back of atomics on the command queue + // set callback on copy back event + // release command queue + // + struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr; + + if (atomics->cmds > 0) + { + cl(SetKernelArg(impl->kernels.rasterize_all,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,1,SKC_CL_ARG(runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,2,SKC_CL_ARG(runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); + + cl(SetKernelArg(impl->kernels.rasterize_all,4,SKC_CL_ARG(cohort->atomics.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,5,SKC_CL_ARG(cohort->keys.drw))); + + cl(SetKernelArg(impl->kernels.rasterize_all,6,SKC_CL_ARG(cohort->transforms.drN))); + cl(SetKernelArg(impl->kernels.rasterize_all,7,SKC_CL_ARG(cohort->clips.drN))); + cl(SetKernelArg(impl->kernels.rasterize_all,8,SKC_CL_ARG(cohort->cmds.drw))); + cl(SetKernelArg(impl->kernels.rasterize_all,9,SKC_CL_ARG(atomics->cmds))); + + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL, + cohort->cq, + impl->kernels.rasterize_all, + atomics->cmds, + 0,NULL,NULL); + } + + // + // copyback number of TTSK keys + // + cl_event complete; + + skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_rasterize_cb,grid)); + cl(ReleaseEvent(complete)); + + // flush command queue + cl(Flush(cohort->cq)); + + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms a + // clips a + // fill_cmds - + // cq a + // cohort atomics a + // cmds a + // keys a + // meta - +} + +static +void +skc_raster_cohort_fills_expand_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_rasterize,grid); +} + +static +void +skc_raster_builder_cohort_grid_pfn_execute(skc_grid_t const grid) +{ + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms i + // clips i + // fill_cmds i + // cq - + // cohort atomics - + // cmds - + // keys - + // meta - + // + + // allocate the cohort + struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); + + // get impl + struct skc_raster_builder_impl * const impl = cohort->impl; + struct skc_runtime * const runtime = impl->runtime; + + // acquire in-order cq + cohort->cq = skc_runtime_acquire_cq_in_order(runtime); + + // alloc the snapshot -- could be zero-sized + skc_extent_phw1g_tdrNs_snap_alloc(runtime, + &impl->fill_cmds, + &cohort->fill_cmds, + cohort->cq,NULL); + + // flush the cq to get the fill running + // cl(Flush(cohort->cq)); + + // create split atomics + skc_extent_thr_tdrw_alloc(runtime,&cohort->atomics,sizeof(struct skc_raster_cohort_atomic)); + + // zero the atomics + skc_extent_thr_tdrw_zero(&cohort->atomics,cohort->cq,NULL); + + // get config + struct skc_config const * const config = runtime->config; + + // acquire device-side extents + skc_extent_tdrw_alloc(runtime, + &cohort->cmds, + sizeof(union skc_cmd_rasterize) * config->raster_cohort.expand.cmds); + + // + // FILLS EXPAND + // + // need result of 
cmd counts before launching RASTERIZE grids + // + // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host + // - OpenCL 2.x: have a kernel size and launch RASTERIZE grids from device + // - or launch a device-wide grid that feeds itself but that's unsatisfying + // + + // how many commands? could be zero + skc_uint const work_size = skc_extent_ring_snap_count(cohort->fill_cmds.snap); + + if (work_size > 0) + { + cl(SetKernelArg(impl->kernels.fills_expand,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.fills_expand,1,SKC_CL_ARG(cohort->atomics.drw))); + cl(SetKernelArg(impl->kernels.fills_expand,2,SKC_CL_ARG(runtime->handle_pool.map.drw))); + cl(SetKernelArg(impl->kernels.fills_expand,3,SKC_CL_ARG(cohort->fill_cmds.drN))); + cl(SetKernelArg(impl->kernels.fills_expand,4,SKC_CL_ARG(cohort->cmds.drw))); + + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_FILLS_EXPAND, + cohort->cq, + impl->kernels.fills_expand, + work_size, + 0,NULL,NULL); + } + + // + // copyback number of rasterization commands + // + cl_event complete; + + skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_fills_expand_cb,grid)); + cl(ReleaseEvent(complete)); + + // flush command queue + cl(Flush(cohort->cq)); + + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms i + // clips i + // fill_cmds s + // cq a + // cohort atomics a + // cmds a + // keys - + // meta - + // +} + +// +// move grid into waiting state +// +// this entails allocating a cohort from the temporary extent +// + +static +void +skc_raster_builder_cohort_grid_pfn_waiting(skc_grid_t const grid) +{ + // get the impl + struct skc_raster_builder_impl * const impl = skc_grid_get_data(grid); + struct skc_runtime * const runtime = impl->runtime; + + // retain the raster builder + impl->raster_builder->refcount += 1; + + // allocate the ephemeral/temp cohort + skc_subbuf_id_t id; + + struct skc_raster_cohort * const cohort = + skc_runtime_host_temp_alloc(runtime, + SKC_MEM_FLAGS_READ_WRITE, + sizeof(*cohort), + &id, + NULL); + + // save the id and backpointer + cohort->id = id; + cohort->impl = impl; + + // set grid data -- replaces impl + skc_grid_set_data(grid,cohort); + + // + // ACQUIRE RESOURCES FOR THE COHORT + // + + struct skc_raster_builder * const raster_builder = impl->raster_builder; + + // immediately take snapshots of all rings -- these are very inexpensive operations + skc_extent_phrwg_thr1s_snap_init(runtime,&raster_builder->path_ids .ring,&cohort->path_ids); + skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->transforms.ring,&cohort->transforms); + skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->clips .ring,&cohort->clips); + skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->fill_cmds .ring,&cohort->fill_cmds); + skc_extent_phrwg_tdrNs_snap_init(runtime,&raster_builder->raster_ids.ring,&cohort->raster_ids); + + // + // ALLOCATED RESOURCES + // + // path_ids i + // raster_ids i + // transforms i + // clips i + // fill_cmds i + // cq - + // cohort atomics - + // cmds - + // keys - + // meta - + // +} + +// +// +// + +static +void +skc_raster_builder_cohort_create(struct skc_raster_builder_impl * const impl) +{ + // attach a grid + impl->cohort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + &impl->cohort, + impl, + skc_raster_builder_cohort_grid_pfn_waiting, + skc_raster_builder_cohort_grid_pfn_execute, + 
skc_raster_builder_cohort_grid_pfn_dispose); +} + +// +// +// + +static +skc_err +skc_raster_builder_pfn_add(struct skc_raster_builder_impl * const impl, + skc_path_t const * paths, + skc_uint count) +{ + // validate and retain the path + skc_err err; + + err = skc_runtime_handle_device_validate_retain(impl->runtime, + SKC_TYPED_HANDLE_TYPE_IS_PATH, + paths, + count); + + if (err) + return err; + + skc_runtime_handle_device_retain(impl->runtime,paths,count); + + // make sure there is a grid + if (impl->cohort == NULL) { + skc_raster_builder_cohort_create(impl); + } + + // declare rasterization grid happens after path + while (count-- > 0) + skc_grid_happens_after_handle(impl->cohort,SKC_TYPED_HANDLE_TO_HANDLE(*paths++)); + + return SKC_ERR_SUCCESS; +} + +// +// +// + +static +void +skc_raster_builder_pfn_end(struct skc_raster_builder_impl * const impl, skc_raster_t * const raster) +{ + // + // acquire host-managed path raster handle and bump reference count + // to 2 handles will be released (reduced to 1) once the rasters are + // completely rasterized + // + *raster = skc_runtime_handle_device_acquire(impl->runtime); + + // make sure there is a grid + if (impl->cohort == NULL) { + skc_raster_builder_cohort_create(impl); + } + + // map a handle to a grid + skc_grid_map(impl->cohort,*raster); +} + +// +// snapshot the ring and lazily start the grid +// +// FIXME -- might want to revisit this and settle on an even more +// opaque implementation. Some options: +// +// - never let the SKC API expose a forced grid start +// - make snapshots kick off a forced grid start +// - be lazy all the time everywhere +// + +static +void +skc_raster_builder_pfn_start(struct skc_raster_builder_impl * const impl) +{ + skc_grid_t const cohort = impl->cohort; + + if (cohort != NULL) { + skc_grid_start(cohort); + } +} + +// +// NOTE: THIS MIGHT BE REMOVED +// + +static +void +skc_raster_builder_pfn_force(struct skc_raster_builder_impl * const impl) +{ + skc_grid_t const cohort = impl->cohort; + + if (cohort != NULL) { + skc_grid_force(cohort); + } +} + +// +// +// + +skc_err +skc_raster_builder_cl_12_create(struct skc_context * const context, + struct skc_raster_builder * * const raster_builder) +{ + struct skc_runtime * const runtime = context->runtime; + + // allocate raster builder + (*raster_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**raster_builder)); + + // refcount + (*raster_builder)->refcount = 1; + + // state + SKC_ASSERT_STATE_INIT((*raster_builder),SKC_RASTER_BUILDER_STATE_READY); + + // allocate runtime raster builder + struct skc_raster_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + // save the impl + (*raster_builder)->impl = impl; + + // intialize impl + impl->raster_builder = (*raster_builder); + impl->runtime = runtime; + impl->cohort = NULL; + + // get config + struct skc_config const * const config = runtime->config; + + skc_extent_phrwg_thr1s_alloc(runtime,&impl->path_ids ,sizeof(skc_path_t ) * config->raster_cohort.path_ids .elem_count); + skc_extent_phw1g_tdrNs_alloc(runtime,&impl->transforms,sizeof(union skc_transform) * config->raster_cohort.transforms.elem_count); + skc_extent_phw1g_tdrNs_alloc(runtime,&impl->clips ,sizeof(union skc_path_clip) * config->raster_cohort.clips .elem_count); + skc_extent_phw1g_tdrNs_alloc(runtime,&impl->fill_cmds ,sizeof(union skc_cmd_fill ) * config->raster_cohort.fill .elem_count); + skc_extent_phrwg_tdrNs_alloc(runtime,&impl->raster_ids,sizeof(skc_raster_t 
) * config->raster_cohort.raster_ids.elem_count); + + // retain the context + //skc_context_retain(context); + + (*raster_builder)->context = context; + + (*raster_builder)->add = skc_raster_builder_pfn_add; + (*raster_builder)->end = skc_raster_builder_pfn_end; + (*raster_builder)->start = skc_raster_builder_pfn_start; + (*raster_builder)->force = skc_raster_builder_pfn_force; + (*raster_builder)->release = skc_raster_builder_pfn_release; + + // initialize raster builder with host-writable buffers + (*raster_builder)->path_ids .extent = impl->path_ids.hrw; + (*raster_builder)->transforms.extent = impl->transforms.hw1; + (*raster_builder)->clips .extent = impl->clips.hw1; + (*raster_builder)->fill_cmds .extent = impl->fill_cmds.hw1; + (*raster_builder)->raster_ids.extent = impl->raster_ids.hrw; + + // + // the rings perform bookkeeping on the extents + // + // the ring snapshotting and checkpointing are necessary because + // another part of the API can _force_ the raster cohort to flush + // its work-in-progress commands but only up to a checkpointed + // boundary + // + skc_extent_ring_init(&(*raster_builder)->path_ids.ring, + config->raster_cohort.path_ids.elem_count, + config->raster_cohort.path_ids.snap_count, + sizeof(skc_path_t)); + + skc_extent_ring_init(&(*raster_builder)->transforms.ring, + config->raster_cohort.transforms.elem_count, + config->raster_cohort.transforms.snap_count, + sizeof(union skc_transform)); + + skc_extent_ring_init(&(*raster_builder)->clips.ring, + config->raster_cohort.clips.elem_count, + config->raster_cohort.clips.snap_count, + sizeof(union skc_path_clip)); + + skc_extent_ring_init(&(*raster_builder)->fill_cmds.ring, + config->raster_cohort.fill.elem_count, + config->raster_cohort.fill.snap_count, + sizeof(union skc_cmd_fill)); + + skc_extent_ring_init(&(*raster_builder)->raster_ids.ring, + config->raster_cohort.raster_ids.elem_count, + config->raster_cohort.raster_ids.snap_count, + sizeof(skc_raster_t)); + + // + // acquire kernels + // + impl->kernels.fills_expand = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_FILLS_EXPAND); + impl->kernels.rasterize_all = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL); + +#if 0 + impl->kernels.rasterize_lines = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES); + impl->kernels.rasterize_quads = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS); + impl->kernels.rasterize_cubics = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS); +#endif + + impl->kernels.segment = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK); + impl->kernels.rasters_alloc = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC); + impl->kernels.prefix = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_PREFIX); + + return SKC_ERR_SUCCESS; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.h b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.h new file mode 100644 index 0000000000..f6e1751ef1 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.h @@ -0,0 +1,165 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
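The ring bookkeeping above (elem_count plus snap_count, with forced flushes bounded by a checkpoint) is easier to see in a tiny host-side sketch. The struct and helpers below are a hypothetical illustration of that snapshot idea, not the skc_extent_ring API.

/* hypothetical sketch of the snapshot/checkpoint idea -- not skc_extent_ring */
#include <stdint.h>

struct ring_demo
{
  uint32_t reads;  /* entries already drained by the device            */
  uint32_t writes; /* entries appended by the host                     */
  uint32_t snap;   /* checkpoint captured when a snapshot was taken    */
};

static void
ring_demo_snap(struct ring_demo * const r)
{
  r->snap = r->writes;       /* freeze the boundary a forced flush may reach */
}

static uint32_t
ring_demo_flushable(struct ring_demo const * const r)
{
  return r->snap - r->reads; /* work-in-progress past the snapshot stays put */
}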
+ * + */ + +#ifndef SKC_RASTER_BUILDER_CL_12_ONCE +#define SKC_RASTER_BUILDER_CL_12_ONCE + +// +// +// + +#include "types.h" +#include "macros.h" +#include "common.h" + +// +// FIXME -- these magic numbers will be replaced with tile.h constants +// although they're probably universal across all devices +// +// FIXME -- NEED TO EVALUATE IF THIS DISTRIBUTION OF BITS IS GOING TO +// BE TOO SMALL -- plenty of room to jiggle these bits +// + +#define SKC_CMD_RASTERIZE_BITS_TRANSFORM 12 +#define SKC_CMD_RASTERIZE_BITS_CLIP 12 +#define SKC_CMD_RASTERIZE_BITS_COHORT 8 + +SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_TRANSFORM == SKC_CMD_FILL_BITS_TRANSFORM); +SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_CLIP == SKC_CMD_FILL_BITS_CLIP); +SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_COHORT == SKC_CMD_FILL_BITS_COHORT); + +// +// device-side rasterization cmd +// + +union skc_cmd_rasterize +{ + skc_ulong u64; + + skc_uint2 u32v2; + + struct { + // + // Unlike anywhere else in the pipeline, the nodeword index points + // "inside" of a path node (with word resolution). This means + // there is up to 16 GB of 32-bit word addressing in a unified + // block pool: + // + // "16GB ought to be enough for anyone" -- ASM 5/30/17 + // + skc_uint nodeword; +#if defined(__OPENCL_C_VERSION__) + skc_uint tcc; +#else + skc_uint transform : SKC_CMD_RASTERIZE_BITS_TRANSFORM; + skc_uint clip : SKC_CMD_RASTERIZE_BITS_CLIP; + skc_uint cohort : SKC_CMD_RASTERIZE_BITS_COHORT; +#endif + }; +}; + +SKC_STATIC_ASSERT(sizeof(union skc_cmd_rasterize) == sizeof(skc_uint2)); + +// +// +// + +#define SKC_CMD_RASTERIZE_HI_OFFSET_COHORT (SKC_CMD_RASTERIZE_BITS_TRANSFORM + SKC_CMD_RASTERIZE_BITS_CLIP) +#define SKC_CMD_RASTERIZE_MASK_COHORT(c) ((c).u32v2.hi & SKC_BITS_TO_MASK_AT(SKC_CMD_RASTERIZE_BITS_COHORT,SKC_CMD_RASTERIZE_HI_OFFSET_COHORT)) + +#define SKC_CMD_RASTERIZE_GET_TRANSFORM(c) ((c).u32v2.hi & SKC_BITS_TO_MASK(SKC_CMD_RASTERIZE_BITS_TRANSFORM)) +#define SKC_CMD_RASTERIZE_GET_CLIP(c) SKC_BFE((c).tcc,SKC_CMD_RASTERIZE_BITS_CLIP,SKC_CMD_RASTERIZE_BITS_TRANSFORM) +#define SKC_CMD_RASTERIZE_GET_COHORT(c) ((c).u32v2.hi >> SKC_CMD_RASTERIZE_HI_OFFSET_COHORT) +// SKC_BFE((c).tcc,SKC_CMD_RASTERIZE_BITS_COHORT,SKC_CMD_RASTERIZE_HI_OFFSET_COHORT) + +// +// +// + +#define SKC_TTSK_SIZE_COHORT (1 << SKC_CMD_RASTERIZE_BITS_COHORT) + +// +// COHORT META DATA +// + +union skc_raster_cohort_meta_in +{ + skc_uint4 u32v4; + + struct { + skc_uint blocks; // # of rk blocks + skc_uint offset; // start of rk span + skc_uint pk; // # of pk keys + skc_uint rk; // # of rk keys + }; +}; + +union skc_raster_cohort_meta_out +{ + skc_uint4 u32v4; + + struct { + skc_uint blocks; // # of blocks in raster -- initially just rk blocks + skc_uint offset; // start of rk span + skc_uint nodes; // # of nodes in raster -- necessary for walking + skc_uint keys; // # of rk & pk keys -- initially just rk + }; +}; + +union skc_raster_cohort_meta_inout +{ + union skc_raster_cohort_meta_in in; + union skc_raster_cohort_meta_out out; +}; + +// +// followed by one word for the offset +// + +struct skc_raster_cohort_meta +{ + union skc_raster_cohort_meta_inout inout[SKC_TTSK_SIZE_COHORT]; + skc_uint reads[SKC_TTSK_SIZE_COHORT]; // starting ring reads -- [0] is raster head +}; + +#define SKC_RASTER_COHORT_META_OFFSET_READS (SKC_OFFSET_OF(struct skc_raster_cohort_meta,reads) / sizeof(skc_uint)) + +// +// COHORT ATOMICS +// + +struct skc_raster_cohort_atomic +{ + // rasterization input + skc_uint cmds; + + // rasterization output + skc_uint keys; + + // block pool base -- idea here is to 
perform one atomic allocation + // skc_uint bp_base; +}; + +#define SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS 0 +#define SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS 1 + +#define SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS_CALC (SKC_OFFSET_OF(struct skc_raster_cohort_atomic,cmds) / sizeof(skc_uint)) +#define SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS_CALC (SKC_OFFSET_OF(struct skc_raster_cohort_atomic,keys) / sizeof(skc_uint)) + +SKC_STATIC_ASSERT(SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS == SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS_CALC); // verify +SKC_STATIC_ASSERT(SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS == SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS_CALC); // verify + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/runtime_cl.c b/src/compute/skc/platforms/cl_12/runtime_cl.c new file mode 100644 index 0000000000..a745ed013e --- /dev/null +++ b/src/compute/skc/platforms/cl_12/runtime_cl.c @@ -0,0 +1,362 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include +#include +#include +#include + +// +// +// + +#include "runtime_cl.h" +#include "common/cl/assert_cl.h" + +// +// +// + +static is_verbose = true; + +// +// FIXME -- all variable length device queries need to start querying +// the parameter's return size before getting its value +// +// FIXME -- this is now handled by the common/cl/find.* routine +// + +union skc_cl_device_version { + struct { + cl_uchar opencl_space[7]; // "OpenCL_" + cl_uchar major; + cl_uchar dot; + cl_uchar minor; +#if 1 // Intel NEO requires at least 16 bytes + cl_uchar space; + cl_uchar vendor[32]; +#endif + }; + struct { + cl_uchar aN[]; + }; +}; + +typedef cl_bitfield cl_diagnostic_verbose_level_intel; + +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4 + +static +void +CL_CALLBACK +skc_context_callback(char const * error, void const * info, size_t size, void * user) +{ + if (info != NULL ) + { + fprintf(stderr,"%s\n",error); + } +} + +// +// +// + +skc_err +skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, + char const * const target_platform_substring, + char const * const target_device_substring, + cl_context_properties context_properties[]) +{ + skc_err err = SKC_ERR_SUCCESS; + + // + // search available devices for a match + // +#define PLATFORM_IDS_MAX 16 +#define DEVICE_IDS_MAX 16 +#define PLATFORM_NAME_SIZE_MAX 64 +#define DEVICE_NAME_SIZE_MAX 64 +#define DRIVER_VERSION_SIZE_MAX 64 + + cl_int cl_err; + + cl_platform_id platform_ids[PLATFORM_IDS_MAX]; + cl_device_id device_ids [PLATFORM_IDS_MAX][DEVICE_IDS_MAX]; + + cl_uint platform_count; + cl_uint device_count[PLATFORM_IDS_MAX]; + + cl_uint platform_idx = UINT32_MAX, device_idx = UINT32_MAX; + + bool match = false; // find _first_ match + + // + // get number of platforms + // + cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count)); + + // + // search platforms + // + for (cl_uint ii=0; iiversion.major = device_version.major - 48; + runtime_cl->version.minor = device_version.minor - 48; + runtime_cl->base_align = base_align; + + if (is_verbose) { + fprintf(stdout," >>>"); + } + } + else if (is_verbose) + { + fprintf(stdout," "); + } + + if (is_verbose) { + fprintf(stdout, + " %1u: %s [ %s ] [ %s ] [ %u ]\n", + jj, + device_name, + device_version.aN, + driver_version, + base_align); + } + } + } + + if 
(is_verbose) { + fprintf(stdout,"\n"); + } + + // + // get target platform and device + // + if (platform_idx >= platform_count) + { + fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring); + exit(EXIT_FAILURE); + } + if (device_idx >= device_count[platform_idx]) + { + fprintf(stderr,"no match for target device substring %s\n",target_device_substring); + exit(EXIT_FAILURE); + } + + runtime_cl->platform_id = platform_ids[platform_idx]; + runtime_cl->device_id = device_ids [platform_idx][device_idx]; + + // + // create context + // + +#if 0 + cl_context_properties context_properties[] = + { + CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id, + 0 + }; +#else + context_properties[1] = (cl_context_properties)runtime_cl->platform_id; +#endif + + runtime_cl->context = clCreateContext(context_properties, + 1, + &runtime_cl->device_id, + skc_context_callback, + NULL, + &cl_err); + cl_ok(cl_err); + + // + // get device name, driver version, and unified memory flag + // + if (is_verbose) + { + char device_name[DEVICE_NAME_SIZE_MAX]; + char driver_version[DRIVER_VERSION_SIZE_MAX]; + cl_bool device_is_unified; + cl_device_svm_capabilities svm_caps; + size_t printf_buffer_size; + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DEVICE_NAME, + sizeof(device_name), + device_name, + NULL)); + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DRIVER_VERSION, + sizeof(driver_version), + driver_version, + NULL)); + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(device_is_unified), + &device_is_unified, + NULL)); + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DEVICE_SVM_CAPABILITIES, + sizeof(svm_caps), + &svm_caps, + 0)); + + cl(GetDeviceInfo(runtime_cl->device_id, + CL_DEVICE_PRINTF_BUFFER_SIZE, + sizeof(printf_buffer_size), + &printf_buffer_size, + NULL)); + + fprintf(stderr, + "CL_DEVICE_SVM_COARSE_GRAIN_BUFFER %c\n" + "CL_DEVICE_SVM_FINE_GRAIN_BUFFER %c\n" + "CL_DEVICE_SVM_FINE_GRAIN_SYSTEM %c\n" + "CL_DEVICE_SVM_ATOMICS %c\n" + "CL_DEVICE_PRINTF_BUFFER_SIZE %zu\n\n", + svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-', + svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? '*' : '-', + svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? '*' : '-', + svm_caps & CL_DEVICE_SVM_ATOMICS ? '*' : '-', + printf_buffer_size); + } + + return err; +} + +// +// +// + +skc_err +skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl) +{ + // FIXME + printf("%s incomplete!\n",__func__); + + return SKC_ERR_SUCCESS; +} + +// +// +// + +cl_command_queue +skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type) +{ + cl_command_queue cq; + + if (runtime_cl->version.major < 2) + { + // + // <= OpenCL 1.2 + // + cl_int cl_err; + + cq = clCreateCommandQueue(runtime_cl->context, + runtime_cl->device_id, + (cl_command_queue_properties)type, + &cl_err); cl_ok(cl_err); + } + else + { + // + // >= OpenCL 2.0 + // + cl_int cl_err; + cl_queue_properties const queue_properties[] = { + CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0 + }; + + cq = clCreateCommandQueueWithProperties(runtime_cl->context, + runtime_cl->device_id, + queue_properties, + &cl_err); cl_ok(cl_err); + } + + return cq; +} + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/runtime_cl.h b/src/compute/skc/platforms/cl_12/runtime_cl.h new file mode 100644 index 0000000000..9e58ca0cc7 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/runtime_cl.h @@ -0,0 +1,79 @@ +/* + * Copyright 2017 Google Inc. 
+ * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// squelch OpenCL 1.2 deprecation warning +// + +#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif + +#include + +// +// +// + +#include "skc.h" + +// +// Minimal OpenCL state needed by the runtime to get started +// + +struct skc_runtime_cl +{ + cl_platform_id platform_id; + cl_device_id device_id; + cl_context context; + + struct { + cl_uint major; + cl_uint minor; + } version; // sometimes we need to know this at runtime + + cl_uint base_align; // base address alignment for subbuffer origins +}; + +// +// +// + +typedef enum skc_cq_type_e { + SKC_CQ_TYPE_IN_ORDER = 0, + SKC_CQ_TYPE_OUT_OF_ORDER = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + SKC_CQ_TYPE_IN_ORDER_PROFILING = (SKC_CQ_TYPE_IN_ORDER | CL_QUEUE_PROFILING_ENABLE), + SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE), +} skc_cq_type_e; + +// +// safely creates a generic OpenCL target in very few lines +// + +skc_err +skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, + char const * const target_platform_substring, + char const * const target_device_substring, + cl_context_properties context_properties[]); + +skc_err +skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl); + +// +// create a command queue with the non-deprecated function +// + +cl_command_queue +skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type); + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.c b/src/compute/skc/platforms/cl_12/runtime_cl_12.c new file mode 100644 index 0000000000..fca13edbbd --- /dev/null +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.c @@ -0,0 +1,314 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include +#include +#include +#include + +// +// +// + +#include "context.h" +#include "block.h" +#include "grid.h" +#include "common/cl/assert_cl.h" +#include "config_cl.h" +#include "runtime_cl.h" +#include "runtime_cl_12.h" +#include "export_cl_12.h" + +// +// +// + +static +void +skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq) +{ + // save size + runtime->block_pool.size = &runtime->config->block_pool; + + // create block extent + skc_extent_pdrw_alloc(runtime, + &runtime->block_pool.blocks, + runtime->block_pool.size->pool_size * + runtime->config->block.bytes); + + // allocate block pool ids + skc_extent_pdrw_alloc(runtime, + &runtime->block_pool.ids, + runtime->block_pool.size->ring_pow2 * sizeof(skc_uint)); + + // allocate block pool atomics + skc_extent_phr_pdrw_alloc(runtime, + &runtime->block_pool.atomics, + sizeof(union skc_block_pool_atomic)); + + // acquire pool id and atomic initialization kernels + cl_kernel k0 = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS); + cl_kernel k1 = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS); + + // init ids + cl(SetKernelArg(k0,0,sizeof(runtime->block_pool.ids.drw),&runtime->block_pool.ids.drw)); + cl(SetKernelArg(k0,1,SKC_CL_ARG(runtime->block_pool.size->pool_size))); + + // the kernel grid is shaped by the target device -- always 2 for atomics + skc_device_enqueue_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS, + cq,k0,runtime->block_pool.size->pool_size, + 0,NULL,NULL); + + // init atomics + cl(SetKernelArg(k1,0,sizeof(runtime->block_pool.atomics.drw),&runtime->block_pool.atomics.drw)); + cl(SetKernelArg(k1,1,SKC_CL_ARG(runtime->block_pool.size->pool_size))); + + // the kernel grid is shaped by the target device + skc_device_enqueue_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS, + cq,k1,2, + 0,NULL,NULL); + + // kickstart kernel execution + cl(Flush(cq)); + + // release kernels + cl(ReleaseKernel(k0)); + cl(ReleaseKernel(k1)); +} + +static +void +skc_block_pool_dispose(struct skc_runtime * const runtime) +{ + skc_extent_phr_pdrw_free(runtime,&runtime->block_pool.atomics); + skc_extent_pdrw_free (runtime,&runtime->block_pool.ids); + skc_extent_pdrw_free (runtime,&runtime->block_pool.blocks); +} + +// +// +// + +static +bool +skc_runtime_yield(struct skc_runtime * const runtime) +{ + return skc_scheduler_yield(runtime->scheduler); +} + +static +void +skc_runtime_wait(struct skc_runtime * const runtime) +{ + skc_scheduler_wait(runtime->scheduler); +} + +// +// +// + +skc_err +skc_runtime_cl_12_create(struct skc_context * const context, + char const * const target_platform_substring, + char const * const target_device_substring, + cl_context_properties context_properties[]) +{ + // allocate the runtime + struct skc_runtime * const runtime = malloc(sizeof(*runtime)); + + // acquire OpenCL ids and context for target device + skc_err err = skc_runtime_cl_create(&runtime->cl, + target_platform_substring, + target_device_substring, + context_properties); + + // create device + skc_device_create(runtime); + + // create the host and device allocators + skc_allocator_host_create(runtime); + skc_allocator_device_create(runtime); + + // how many slots in the scheduler? 
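The bring-up above starts from the small bootstrap declared in runtime_cl.h, so a standalone usage sketch of those two entry points may help. The platform and device substrings and the property list below are example values only; skc_runtime_cl_create() itself patches the CL_CONTEXT_PLATFORM slot with the matched platform.

/* illustrative use of the runtime_cl.h entry points -- example values only */
#include <CL/opencl.h>

#include "runtime_cl.h"

static void
demo_bringup(void)
{
  struct skc_runtime_cl rt;

  /* create() fills context_properties[1] with the matched platform id */
  cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 };

  skc_err const err = skc_runtime_cl_create(&rt,"Intel","Graphics",props);

  if (err == SKC_ERR_SUCCESS)
    {
      cl_command_queue const cq = skc_runtime_cl_create_cq(&rt,SKC_CQ_TYPE_IN_ORDER_PROFILING);

      /* ... enqueue and profile work ... */

      clReleaseCommandQueue(cq);

      skc_runtime_cl_dispose(&rt);
    }
}

skc_runtime_cl_12_create() above performs the same first step with the context_properties forwarded from its caller before layering the allocators, scheduler, pools, and pfns on top.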
+ runtime->scheduler = skc_scheduler_create(runtime,runtime->config->scheduler.size); + + // allocate deps structure + runtime->deps = skc_grid_deps_create(runtime, + runtime->scheduler, + runtime->config->block_pool.pool_size); + + // initialize cq pool + skc_cq_pool_create(runtime, + &runtime->cq_pool, + runtime->config->cq_pool.type, + runtime->config->cq_pool.size); + + // acquire in-order cq + cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); + + // initialize block pool + skc_block_pool_create(runtime,cq); + + // intialize handle pool + skc_handle_pool_create(runtime, + &runtime->handle_pool, + runtime->config->handle_pool.size, + runtime->config->handle_pool.width, + runtime->config->handle_pool.recs); + + // + // initialize pfns + // + // FIXME -- at this point we will have identified which device we've + // targeted and will load a DLL (or select from a built-in library) + // that contains all the pfns. + // + context->runtime = runtime; + + context->yield = skc_runtime_yield; + context->wait = skc_runtime_wait; + + context->path_builder = skc_path_builder_cl_12_create; + context->path_retain = skc_runtime_path_host_retain; + context->path_release = skc_runtime_path_host_release; + context->path_flush = skc_runtime_path_host_flush; + + context->raster_builder = skc_raster_builder_cl_12_create; + context->raster_retain = skc_runtime_raster_host_retain; + context->raster_release = skc_runtime_raster_host_release; + context->raster_flush = skc_runtime_raster_host_flush; + + context->composition = skc_composition_cl_12_create; + context->styling = skc_styling_cl_12_create; + + context->surface = skc_surface_cl_12_create; + + // block on pool creation + cl(Finish(cq)); + + // dispose of in-order cq + skc_runtime_release_cq_in_order(runtime,cq); + + return err; +}; + +// +// +// + +skc_err +skc_runtime_cl_12_dispose(struct skc_context * const context) +{ + // + // FIXME -- incomplete + // + fprintf(stderr,"%s incomplete!\n",__func__); + + struct skc_runtime * runtime = context->runtime; + + skc_allocator_device_dispose(runtime); + skc_allocator_host_dispose(runtime); + + skc_scheduler_dispose(context->runtime,context->runtime->scheduler); + + skc_grid_deps_dispose(context->runtime->deps); + + skc_cq_pool_dispose(runtime,&runtime->cq_pool); + + skc_block_pool_dispose(context->runtime); + + // skc_handle_pool_dispose(context->runtime); + + return SKC_ERR_SUCCESS; +} + +// +// TEMPORARY BENCHMARK +// + +#if 1 + +#include + +#define SKC_FRAMES_MASK 0x7F +#define SKC_FRAMES (SKC_FRAMES_MASK + 1) + +void +skc_runtime_cl_12_debug(struct skc_context * const context) +{ +#ifdef NDEBUG + static skc_uint frames=0; + static LARGE_INTEGER StartingTime={0}, EndingTime; + + if ((frames++ & SKC_FRAMES_MASK) != SKC_FRAMES_MASK) + return; + + QueryPerformanceCounter(&EndingTime); + + LARGE_INTEGER ElapsedMicroseconds, Frequency; + + ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart; + + QueryPerformanceFrequency(&Frequency); + + double const msecs_total = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart; + double const msecs_frame = msecs_total / SKC_FRAMES; + + printf("Frames / Total / Per : %u / %.3f / %.3f\n", + SKC_FRAMES,msecs_total,msecs_frame); +#endif + + struct skc_runtime * const runtime = context->runtime; + + // acquire out-of-order cq + cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); + + // copy atomics to host + skc_extent_phr_pdrw_read(&runtime->block_pool.atomics,cq,NULL); + + // block until complete + cl(Finish(cq)); + + // 
dispose of out-of-order cq + skc_runtime_release_cq_in_order(runtime,cq); + + union skc_block_pool_atomic const * const bp_atomic = runtime->block_pool.atomics.hr; + + skc_uint const available = bp_atomic->writes - bp_atomic->reads; + skc_uint const inuse = runtime->config->block_pool.pool_size - available; + + fprintf(stderr,"w/r/f/a: %9u - %9u = %9u : %6.2f MB\n", + bp_atomic->writes, + bp_atomic->reads, + available, + (inuse * runtime->config->block.bytes) / (1024.0*1024.0)); + + if (available >= (1<<27)) + { + fprintf(stderr,"block pool corrupted!\n"); + exit(-1); + } + + // + // + // +#ifdef NDEBUG + QueryPerformanceCounter(&StartingTime); +#endif +} + +#endif + +// +// +// + diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.h b/src/compute/skc/platforms/cl_12/runtime_cl_12.h new file mode 100644 index 0000000000..7e7ffcb284 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.h @@ -0,0 +1,177 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include "runtime.h" +#include "runtime_cl.h" +#include "cq_pool_cl.h" +#include "handle_pool_cl_12.h" +#include "block_pool_cl_12.h" +#include "allocator_device_cl.h" + +// +// FIXME -- two parts: +// +// 1. directly access the structures in the runtime sub-struct implementations +// 2. possibly wall off the non-platform-specific structs into a sub structure +// + +struct skc_runtime +{ + // + // state visible to device + // + struct skc_runtime_cl cl; + + struct { + struct skc_allocator_host host; + struct skc_allocator_device device; + } allocator; + + struct skc_cq_pool cq_pool; + + struct skc_block_pool block_pool; + + struct skc_handle_pool handle_pool; + + // + // state that is slightly opaque (for now) + // + struct skc_scheduler * scheduler; + + struct skc_grid_deps * deps; + + struct skc_config const * config; // FIXME: config will be determined by device with some opportunities to resize + + struct skc_device * device; // opaque bundle of kernels +}; + +// +// Creation and disposal intitializes context and may rely on other +// context resources like the scheduler +// + +skc_err +skc_runtime_cl_12_create(struct skc_context * const context, + char const * const target_platform_substring, + char const * const target_device_substring, + cl_context_properties context_properties[]); + +skc_err +skc_runtime_cl_12_dispose(struct skc_context * const context); + +// +// HOST HANDLE RETAIN/RELEASE/FLUSH +// + +skc_err +skc_runtime_path_host_retain(struct skc_runtime * const runtime, + skc_path_t const * paths, + uint32_t count); + +skc_err +skc_runtime_raster_host_retain(struct skc_runtime * const runtime, + skc_raster_t const * rasters, + uint32_t count); + + +skc_err +skc_runtime_path_host_release(struct skc_runtime * const runtime, + skc_path_t const * paths, + uint32_t count); + +skc_err +skc_runtime_raster_host_release(struct skc_runtime * const runtime, + skc_raster_t const * rasters, + uint32_t count); + + +skc_err +skc_runtime_path_host_flush(struct skc_runtime * const runtime, + skc_path_t const * paths, + uint32_t count); + +skc_err +skc_runtime_raster_host_flush(struct skc_runtime * const runtime, + skc_raster_t const * rasters, + uint32_t count); + +// +// DEVICE/PIPELINE HANDLE ACQUIRE/RETAIN/RELEASE +// +// The retain operations pre-validate handles +// + +skc_handle_t +skc_runtime_handle_device_acquire(struct skc_runtime * const runtime); + +skc_err 
+skc_runtime_handle_device_validate_retain(struct skc_runtime * const runtime, + skc_typed_handle_type_e const handle_type, + skc_typed_handle_t const * typed_handles, + uint32_t count); + +void +skc_runtime_handle_device_retain(struct skc_runtime * const runtime, + skc_handle_t const * handles, + uint32_t count); + +void +skc_runtime_path_device_release(struct skc_runtime * const runtime, + skc_handle_t const * handles, + uint32_t count); + +void +skc_runtime_raster_device_release(struct skc_runtime * const runtime, + skc_handle_t const * handles, + uint32_t count); + +// +// We only use in-order command queues in the pipeline +// + +cl_command_queue +skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime); + +void +skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, + cl_command_queue cq); + +// +// DEVICE MEMORY ALLOCATION +// + +cl_mem +skc_runtime_device_perm_alloc(struct skc_runtime * const runtime, + cl_mem_flags const flags, + size_t const size); + +void +skc_runtime_device_perm_free(struct skc_runtime * const runtime, + cl_mem const mem); + +cl_mem +skc_runtime_device_temp_alloc(struct skc_runtime * const runtime, + cl_mem_flags const flags, + size_t const size, + skc_subbuf_id_t * const subbuf_id, + size_t * const subbuf_size); + +void +skc_runtime_device_temp_free(struct skc_runtime * const runtime, + cl_mem const mem, + skc_subbuf_id_t const subbuf_id); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/styling_cl_12.c b/src/compute/skc/platforms/cl_12/styling_cl_12.c new file mode 100644 index 0000000000..6c84fe6f70 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/styling_cl_12.c @@ -0,0 +1,339 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// NOTES: +// +// - this particular object only needs a command queue for a short +// time so consider acquiring/releasing the command queue on demand +// but only if command queues are cached and expensive to keep +// + +#include "common/cl/assert_cl.h" + +#include "styling_cl_12.h" +#include "extent_cl_12.h" +#include "runtime_cl_12.h" + +#include "context.h" +#include "styling_types.h" + +// +// +// + +static +void +skc_styling_unmap_complete(skc_grid_t const grid) +{ + struct skc_styling_impl * const impl = skc_grid_get_data(grid); + + impl->state = SKC_STYLING_STATE_SEALED; + + skc_grid_complete(grid); +} + +static +void +skc_styling_unmap_cb(cl_event event, cl_int status, skc_grid_t const grid) +{ + SKC_CL_CB(status); + + struct skc_styling_impl * const impl = skc_grid_get_data(grid); + struct skc_scheduler * const scheduler = impl->runtime->scheduler; + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(scheduler,skc_styling_unmap_complete,grid); +} + +static +void +skc_styling_grid_pfn_execute(skc_grid_t const grid) +{ + struct skc_styling_impl * const impl = skc_grid_get_data(grid); + struct skc_styling * const styling = impl->styling; + + // + // unmap all extents + // + cl_event complete; + + skc_extent_phwN_pdrN_unmap(&impl->layers,styling->layers.extent,impl->cq,NULL); + skc_extent_phwN_pdrN_unmap(&impl->groups,styling->groups.extent,impl->cq,NULL); + skc_extent_phwN_pdrN_unmap(&impl->extras,styling->extras.extent,impl->cq,&complete); + + // set the event + cl(SetEventCallback(complete,CL_COMPLETE,skc_styling_unmap_cb,grid)); + cl(ReleaseEvent(complete)); + + // flush command queue + cl(Flush(impl->cq)); +} + +// +// +// + +static +void +skc_styling_pfn_seal(struct skc_styling_impl * const impl) +{ + // return if sealing or sealed + if (impl->state >= SKC_STYLING_STATE_SEALING) + return; + + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // + // otherwise, wait for UNSEALING > UNSEALED transition + // + if (impl->state == SKC_STYLING_STATE_UNSEALING) + { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); + } + + // + // we're unsealed so we need to seal and start the grid + // + impl->state = SKC_STYLING_STATE_SEALING; + impl->grid = SKC_GRID_DEPS_ATTACH(runtime->deps, + NULL, + impl, + NULL, // no waiting + skc_styling_grid_pfn_execute, + NULL); // no dispose + + // no need to force -- styling has no dependencies + skc_grid_start(impl->grid); +} + +// +// +// + +void +skc_styling_unseal_complete(struct skc_styling_impl * const impl) +{ + struct skc_runtime * const runtime = impl->runtime; + + // we're now unsealed + impl->state = SKC_STYLING_STATE_UNSEALED; +} + +static +void +skc_styling_unseal_cb(cl_event event, cl_int status, struct skc_styling_impl * const impl) +{ + SKC_CL_CB(status); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(impl->runtime->scheduler,skc_styling_unseal_complete,impl); +} + +static +void +skc_styling_pfn_unseal(struct skc_styling_impl * const impl, skc_bool const block) +{ + // return if already unsealed + if (impl->state == SKC_STYLING_STATE_UNSEALED) + return; + + // + // otherwise, we're going to need to pump the scheduler + // + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // + // wait for UNSEALING > UNSEALED transition + // + 
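The seal and unseal paths around here form a small state machine that advances by pumping the scheduler until the desired state is reached. The transition summary below restates what the surrounding code does, and the macro is only a plausible expansion of the wait-while pattern; the real SKC_SCHEDULER_WAIT_WHILE definition belongs to the scheduler and may differ.

/* styling state transitions implied by the surrounding code:
 *
 *   UNSEALED --seal()-->   SEALING   --extents unmapped--> SEALED
 *   SEALED   --unseal()--> UNSEALING --extents mapped-->   UNSEALED
 */

/* hypothetical expansion of the wait-while pumping pattern (assumption only) */
#define DEMO_WAIT_WHILE(scheduler,predicate)    \
  while (predicate)                             \
    {                                           \
      if (!skc_scheduler_yield(scheduler))      \
        skc_scheduler_wait(scheduler);          \
    }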
if (impl->state == SKC_STYLING_STATE_UNSEALING) + { + if (block) { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); + } + return; + } + + // + // otherwise, wait for SEALING > SEALED transition ... + // + if (impl->state == SKC_STYLING_STATE_SEALING) + { + // wait if sealing + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_SEALED); + } + + // wait for rendering locks to be released + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0); + + // ... and then unseal the styling object + impl->state = SKC_STYLING_STATE_UNSEALING; + + // defensively NULL the grid reference + impl->grid = NULL; // defensive + + // set styling pointers with mapped extents + cl_event complete; + + struct skc_styling * const styling = impl->styling; + + styling->layers.extent = skc_extent_phwN_pdrN_map(&impl->layers,impl->cq,NULL); + styling->groups.extent = skc_extent_phwN_pdrN_map(&impl->groups,impl->cq,NULL); + styling->extras.extent = skc_extent_phwN_pdrN_map(&impl->extras,impl->cq,&complete); + + cl(SetEventCallback(complete,CL_COMPLETE,skc_styling_unseal_cb,impl)); + cl(ReleaseEvent(complete)); + + // flush it + cl(Flush(impl->cq)); + + // wait until unsealed... + if (block) { + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); + } +} + +// +// +// + +static +void +skc_styling_pfn_release(struct skc_styling_impl * const impl) +{ + if (--impl->styling->ref_count != 0) + return; + + // + // otherwise, unmap all resources by sealing and delete + // + skc_styling_pfn_seal(impl); + + struct skc_runtime * const runtime = impl->runtime; + struct skc_scheduler * const scheduler = runtime->scheduler; + + // wait until sealed + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_SEALED); + + // wait for locks to drain + SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0) + + // + // styling is now disposable + // + + // free styling host + skc_runtime_host_perm_free(runtime,impl->styling); + + // release the cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // free extents + skc_extent_phwN_pdrN_free(runtime,&impl->layers); + skc_extent_phwN_pdrN_free(runtime,&impl->groups); + skc_extent_phwN_pdrN_free(runtime,&impl->extras); + + // free styling impl + skc_runtime_host_perm_free(runtime,impl); +} + +// +// +// + +void +skc_styling_retain_and_lock(struct skc_styling * const styling) +{ + skc_styling_retain(styling); + + styling->impl->lock_count += 1; +} + +void +skc_styling_unlock_and_release(struct skc_styling * const styling) +{ + styling->impl->lock_count -= 1; + + skc_styling_pfn_release(styling->impl); +} + +// +// +// + +skc_err +skc_styling_cl_12_create(struct skc_context * const context, + struct skc_styling * * const styling, + skc_uint const layers_count, + skc_uint const groups_count, + skc_uint const extras_count) +{ + // retain the context + // skc_context_retain(context); + + // allocate the impl + struct skc_runtime * const runtime = context->runtime; + struct skc_styling_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + // allocate styling + (*styling) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**styling)); + (*styling)->context = context; + (*styling)->impl = impl; + + // intialize impl + impl->styling = (*styling); + impl->runtime = runtime; + + SKC_ASSERT_STATE_INIT(impl,SKC_STYLING_STATE_SEALED); + + impl->lock_count = 0; + + impl->cq = skc_runtime_acquire_cq_in_order(runtime); + + // + // The styling object 
is unique in that the API lets the user + // specify resource limits + // + // The styling object is a simple container that can have wildly + // varying resource requirements (but still relatively modest). + // + // Additionally, an advanced SKC programmer may want to create many + // styling and composition objects as they're relatively cheap. + // + skc_extent_phwN_pdrN_alloc(runtime,&impl->layers,sizeof(*(*styling)->layers.extent) * layers_count); + skc_extent_phwN_pdrN_alloc(runtime,&impl->groups,sizeof(*(*styling)->groups.extent) * groups_count); + skc_extent_phwN_pdrN_alloc(runtime,&impl->extras,sizeof(*(*styling)->extras.extent) * extras_count); + + // initialize styling + (*styling)->layers.size = layers_count; + (*styling)->groups.size = groups_count; + (*styling)->extras.size = extras_count; + + (*styling)->layers.count = 0; + (*styling)->groups.count = 0; + (*styling)->extras.count = 0; + + // save pfns + (*styling)->seal = skc_styling_pfn_seal; + (*styling)->unseal = skc_styling_pfn_unseal; + (*styling)->release = skc_styling_pfn_release; + + // set ref count + (*styling)->ref_count = 1; + + // map the extents by unsealing + skc_styling_pfn_unseal(impl,false); + + return SKC_ERR_SUCCESS; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/styling_cl_12.h b/src/compute/skc/platforms/cl_12/styling_cl_12.h new file mode 100644 index 0000000000..a319568ee5 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/styling_cl_12.h @@ -0,0 +1,73 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +#include "styling.h" +#include "grid.h" +#include "extent_cl_12.h" +#include "assert_state.h" + +// +// styling states +// + +typedef enum skc_styling_state_e { + + SKC_STYLING_STATE_UNSEALING, + SKC_STYLING_STATE_UNSEALED, + SKC_STYLING_STATE_SEALING, + SKC_STYLING_STATE_SEALED + +} skc_styling_state_e; + +// +// IMPL +// + +struct skc_styling_impl +{ + struct skc_styling * styling; + struct skc_runtime * runtime; + + SKC_ASSERT_STATE_DECLARE(skc_styling_state_e); + + skc_int lock_count; // # of wip renders + + skc_grid_t grid; + + // in-order command queue + cl_command_queue cq; + + // + // only 3 extents + // + struct skc_extent_phwN_pdrN layers; + struct skc_extent_phwN_pdrN groups; + struct skc_extent_phwN_pdrN extras; +}; + +// +// ONLY VISIBLE WITHIN THIS RUNTIME +// + +void +skc_styling_retain_and_lock(struct skc_styling * const styling); + +void +skc_styling_unlock_and_release(struct skc_styling * const styling); + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/surface_cl_12.h b/src/compute/skc/platforms/cl_12/surface_cl_12.h new file mode 100644 index 0000000000..43ea5428a5 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/surface_cl_12.h @@ -0,0 +1,32 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_SURFACE_CL_12_ONCE +#define SKC_SURFACE_CL_12_ONCE + +// +// Unlike other object platform implementations, the surface object +// implementation needs to access the opaque platform-specfic outputs +// of the composition and styling objects. +// +// Composition : { keys, offsets, key_count, offset_count } +// Styling : { layers, groups, commands } +// +// With the OpenCL platform we'll handle this by simply exposing the +// argument value (void*) and its size (size_t). 
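One way to picture the hand-off described above is a (value, size) pair that can be forwarded straight to clSetKernelArg(). The struct below is purely illustrative and is not a type declared by this header.

/* hypothetical shape of the opaque argument hand-off described above */
#include <stddef.h>

struct demo_surface_arg
{
  void const * value; /* e.g. a pointer to a composition's cl_mem handle */
  size_t       size;  /* e.g. sizeof(cl_mem)                             */
};

/* forwarded as: clSetKernelArg(render_kernel,index,arg.size,arg.value) */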
+// +// TODO: It might make sense in the future to support more complex +// rendering jobs that simultaneously involve multiple surfaces, +// compositions and stylings. +// + +#endif + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c b/src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c new file mode 100644 index 0000000000..cc7cba5225 --- /dev/null +++ b/src/compute/skc/platforms/cl_12/surface_cl_12_buffer.c @@ -0,0 +1,453 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "common/cl/assert_cl.h" + +#include "extent_cl_12.h" +#include "runtime_cl_12.h" +#include "styling_cl_12.h" +#include "composition_cl_12.h" + +#include "context.h" +#include "surface.h" + +// +// +// + +#include + +// +// BUILD +// + +struct skc_surface_impl +{ + struct skc_surface * surface; + struct skc_runtime * runtime; + + // framebuffer + // struct skc_extent_pdrw fb; + // struct skc_extent_phrN_pdwN fb; + + // for now, a single in-order command queue + cl_command_queue cq; + + struct { + cl_kernel render; + } kernels; +}; + +// +// we might want concurrent access to the same surface as long as +// the clips don't overlap. +// +// this would require acquiring a cq on demand when it is determined +// that the clipped render won't overlap +// +// { tile clip , cq } pair +// +// skc_uint4 clip; +// cl_command_queue cq +// + +struct skc_surface_render +{ + skc_uint clip[4]; + + struct skc_surface_impl * impl; + struct skc_styling * styling; + struct skc_composition * composition; + + skc_surface_render_pfn_notify notify; + void * data; + + cl_mem fb; + + skc_grid_t grid; + + skc_subbuf_id_t id; +}; + +// +// +// + +static +void +skc_surface_pfn_clear(struct skc_surface_impl * const impl, + float const rgba[4], + skc_uint const rect[4], + void * fb) +{ + size_t const origin[3] = { rect[0], rect[1], 0 }; + size_t const region[3] = { rect[2], rect[3], 1 }; + + cl(EnqueueFillImage(impl->cq, + (cl_mem)fb, + rgba, + origin, + region, + 0,NULL,NULL)); +} + +// +// +// + +static +void +skc_surface_pfn_blit(struct skc_surface_impl * const impl, + skc_uint const rect[4], + skc_int const txty[2]) +{ + ; +} + +// +// +// + +#if 0 // #ifndef NDEBUG +#define SKC_SURFACE_DEBUG +#endif + +#ifdef SKC_SURFACE_DEBUG + +#define SKC_SURFACE_WIDTH 4096 +#define SKC_SURFACE_HEIGHT 4096 + +static +void +skc_surface_debug(struct skc_surface_impl * const impl) +{ + // + // MAP + // + cl_uchar4 * const rgba = skc_extent_phrN_pdwN_map(&impl->fb, + impl->cq, + NULL); + cl(Finish(impl->cq)); + + // + // WRITE + // + FILE* file; + + errno_t ferr = fopen_s(&file,"surface.ppm","wb"); + + fprintf(file,"P6\n%u %u\n255\n",SKC_SURFACE_WIDTH,SKC_SURFACE_HEIGHT); + + for (skc_uint ii=0; iifb,rgba,impl->cq,NULL); + + cl(Flush(impl->cq)); +} + +#endif + +// +// +// + +void +skc_surface_render_complete(struct skc_surface_render * const render) +{ +#ifdef SKC_SURFACE_DEBUG + // write fb out + skc_surface_debug(render->impl); +#endif + + // notify + if (render->notify != NULL) { + render->notify(render->impl->surface, + render->styling, + render->composition, + render->data); + } + + // unlock and release the styling and composition + skc_styling_unlock_and_release(render->styling); + skc_composition_unlock_and_release(render->composition); + + // grid is now complete + skc_grid_complete(render->grid); +} + +static +void +skc_surface_render_cb(cl_event event, cl_int status, struct 
skc_surface_render * const render) +{ + SKC_CL_CB(status); + + // as quickly as possible, enqueue next stage in pipeline to context command scheduler + SKC_SCHEDULER_SCHEDULE(render->impl->runtime->scheduler, + skc_surface_render_complete, + render); +} + +// +// +// + +static +void +skc_surface_grid_pfn_execute(skc_grid_t const grid) +{ + struct skc_surface_render * const render = skc_grid_get_data(grid); + struct skc_surface_impl * const impl = render->impl; + struct skc_runtime * const runtime = impl->runtime; + + // get the composition args + struct skc_composition_impl * const ci = render->composition->impl; + struct skc_place_atomics * const atomics = ci->atomics.hr; + + if (atomics->offsets > 0) + { + // acquire the rbo + cl(EnqueueAcquireGLObjects(impl->cq,1,&render->fb,0,NULL,NULL)); + + // get the styling args + struct skc_styling_impl * const si = render->styling->impl; + + cl(SetKernelArg(impl->kernels.render,0,SKC_CL_ARG(si->layers.drN))); + cl(SetKernelArg(impl->kernels.render,1,SKC_CL_ARG(si->groups.drN))); + cl(SetKernelArg(impl->kernels.render,2,SKC_CL_ARG(si->extras.drN))); + + cl(SetKernelArg(impl->kernels.render,3,SKC_CL_ARG(ci->keys.drw))); + cl(SetKernelArg(impl->kernels.render,4,SKC_CL_ARG(atomics->keys))); + cl(SetKernelArg(impl->kernels.render,5,SKC_CL_ARG(ci->offsets.drw))); + cl(SetKernelArg(impl->kernels.render,6,SKC_CL_ARG(atomics->offsets))); + + // block pool + cl(SetKernelArg(impl->kernels.render,7,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + + // surface + cl(SetKernelArg(impl->kernels.render,8,SKC_CL_ARG(render->fb))); + +#if 1 + // tile clip + cl(SetKernelArg(impl->kernels.render,9,sizeof(skc_uint4),render->clip)); +#else + // surface pitch (height) + skc_uint const surface_pitch = SKC_SURFACE_HEIGHT; + cl(SetKernelArg(impl->kernels.render,9,SKC_CL_ARG(surface_pitch))); + // tile clip + cl(SetKernelArg(impl->kernels.render,10,sizeof(skc_uint4),render->clip)); +#endif + + // launch render kernel + skc_device_enqueue_kernel(runtime->device, + SKC_DEVICE_KERNEL_ID_RENDER, + impl->cq, + impl->kernels.render, + atomics->offsets, + 0,NULL,NULL); + + + cl_event complete; + + // give the rbo back + cl(EnqueueReleaseGLObjects(impl->cq,1,&render->fb,0,NULL,&complete)); + + // notify anyone listening... 
+ cl(SetEventCallback(complete,CL_COMPLETE,skc_surface_render_cb,render)); + cl(ReleaseEvent(complete)); + + // flush it + cl(Flush(impl->cq)); + } + else + { + skc_surface_render_complete(render); + } +} + +// +// +// + +static +void +skc_surface_pfn_release(struct skc_surface_impl * const impl) +{ + if (--impl->surface->ref_count != 0) + return; + + // + // otherwise, release all resources + // + + // drain the command queue + cl(Finish(impl->cq)); + + struct skc_runtime * const runtime = impl->runtime; + + // release the kernel + cl(ReleaseKernel(impl->kernels.render)); + + // free surface host + skc_runtime_host_perm_free(runtime,impl->surface); + + // release the cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // release fb + // skc_extent_phrN_pdwN_free(runtime,&impl->fb); + + // free surface impl + skc_runtime_host_perm_free(runtime,impl); +} + +// +// +// + +static +void +skc_surface_grid_pfn_dispose(skc_grid_t const grid) +{ + struct skc_surface_render * const render = skc_grid_get_data(grid); + struct skc_surface_impl * const impl = render->impl; + struct skc_runtime * const runtime = impl->runtime; + + // free the render object + skc_runtime_host_temp_free(runtime,render,render->id); + + // release the surface + skc_surface_pfn_release(impl); +} + +// +// +// + +static +void +skc_surface_pfn_render(struct skc_surface_impl * const impl, + uint32_t const clip[4], + skc_styling_t styling, + skc_composition_t composition, + skc_surface_render_pfn_notify notify, + void * data, + void * fb) +{ + // retain surface + skc_surface_retain(impl->surface); + + // + // FIXME -- we used to seal the styling and composition objects if + // they weren't already. Either test that they're sealed or seal + // them here. + // + + // retain and lock the styling and composition + skc_styling_retain_and_lock(styling); + skc_composition_retain_and_lock(composition); + + // + // allocate a render instance + // + skc_subbuf_id_t id; + struct skc_surface_render * const render = skc_runtime_host_temp_alloc(impl->runtime, + SKC_MEM_FLAGS_READ_WRITE, + sizeof(*render),&id,NULL); + render->id = id; + + render->clip[0] = clip[0]; + render->clip[1] = clip[1]; + render->clip[2] = clip[2]; + render->clip[3] = clip[3]; + + render->impl = impl; + render->styling = styling; + render->composition = composition; + + render->notify = notify; + render->data = data; + + render->fb = fb; + + render->grid = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + NULL, // invalidation not necessary + render, + NULL, // no waiting + skc_surface_grid_pfn_execute, + skc_surface_grid_pfn_dispose); + + // declare happens-after relationships + skc_grid_happens_after_grid(render->grid,styling->impl->grid); + skc_grid_happens_after_grid(render->grid,composition->impl->grids.sort); + + // wait for styling and composition + skc_grid_start(render->grid); +} + +// +// +// + +skc_err +skc_surface_cl_12_create(struct skc_context * const context, + struct skc_surface * * const surface) +{ + struct skc_runtime * const runtime = context->runtime; + + // allocate surface + (*surface) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**surface)); + + // allocate impl + struct skc_surface_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + // initialize surface + // SKC_ASSERT_STATE_INIT((*impl),SKC_SURFACE_STATE_READY); + + (*surface)->context = context; + (*surface)->impl = impl; + (*surface)->ref_count = 1; + + (*surface)->release = skc_surface_pfn_release; + 
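skc_surface_pfn_render() above accepts a notify pfn that skc_surface_render_complete() invokes with the surface, styling, composition, and user data once the render grid (which waits on the styling grid and the composition's sort grid) has finished. The callback below is a sketch whose parameter spelling is inferred from that call site; the canonical skc_surface_render_pfn_notify typedef is declared elsewhere.

/* example notify callback matching the skc_surface_render_complete() call
   site above -- parameter types inferred, so treat this as a sketch       */
static void
demo_render_notify(struct skc_surface     * const surface,
                   struct skc_styling     * const styling,
                   struct skc_composition * const composition,
                   void                   * const data)
{
  /* e.g. signal frame completion to the application through 'data' */
  (void)surface; (void)styling; (void)composition; (void)data;
}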
(*surface)->clear = skc_surface_pfn_clear; + (*surface)->blit = skc_surface_pfn_blit; + (*surface)->render = skc_surface_pfn_render; + + // intialize impl + impl->surface = *surface; + impl->runtime = runtime; + +#if 0 + // FIXME -- 4K x 4K -- temporarily fixed size + size_t const fb_size = sizeof(skc_uchar4) * SKC_SURFACE_WIDTH * SKC_SURFACE_HEIGHT; + + // create framebuffer + skc_extent_phrN_pdwN_alloc(runtime,&impl->fb,fb_size); +#endif + + // acquire a command queue + impl->cq = skc_runtime_acquire_cq_in_order(runtime); + + // acquire kernel + impl->kernels.render = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RENDER); + + return SKC_ERR_SUCCESS; +} + +// +// +// diff --git a/src/compute/skc/prefix.cl b/src/compute/skc/prefix.cl deleted file mode 100644 index 960b6cf5ff..0000000000 --- a/src/compute/skc/prefix.cl +++ /dev/null @@ -1,1042 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block.h" -#include "raster.h" -#include "atomic_cl.h" -#include "macros.h" -#include "tile.h" - -// -// INPUT: -// -// TTRK (64-BIT COMPARE) -// -// 0 63 -// | TTSB ID | X | Y | COHORT ID | -// +---------+------+------+-----------+ -// | 27 | 12 | 12 | 13 | -// -// -// TTRK (32-BIT COMPARE) -// -// 0 63 -// | TTSB ID | N/A | X | Y | COHORT ID | -// +---------+-----+------+------+-----------+ -// | 27 | 5 | 12 | 12 | 8 | -// -// -// OUTPUT: -// -// TTSK v2: -// -// 0 63 -// | TTSB ID | PREFIX | N/A | X | Y | -// +---------+--------+------+----+----+ -// | 27 | 1 (=0) | 12 | 12 | 12 | -// -// -// TTPK v1: -// -// 0 63 -// | TTPB ID | ALL ZEROES | SPAN | X | Y | -// +---------+------------+------+-----+-----+ -// | 27 | 1 | 12 | 12 | 12 | -// -// -// TTPK v2: -// -// 0 63 -// | TTPB ID | PREFIX | SPAN | X | Y | -// +---------+--------+------+-----+-----+ -// | 27 | 1 (=1) | 12 | 12 | 12 | -// - -#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1) - -// -// smem accumulator -// - -union skc_subgroup_accum -{ - struct { - SKC_ATOMIC_INT ttp[SKC_TILE_HEIGHT]; - } atomic; - - struct { - skc_ttp_t ttp[SKC_TILE_HEIGHT]; - } aN; - - struct { - SKC_PREFIX_TTP_V ttp[SKC_PREFIX_SUBGROUP_SIZE]; - } vN; - - struct { - SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH]; - } zero; -}; - -// -// -// - -struct skc_subgroup_smem -{ - // prefix accumulator - union skc_subgroup_accum accum; -}; - -// -// -// - -static -skc_uint -skc_subgroup_lane() -{ -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - return get_sub_group_local_id(); -#else - return 0; -#endif -} - -// -// -// - -static -SKC_PREFIX_TTS_V_BITFIELD -skc_tts_get_dy(skc_tts_v_t const ttsv) -{ - // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32] - SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY; - - return dy - (~ttsv >> 31); -} - -static -SKC_PREFIX_TTS_V_BITFIELD -skc_tts_get_py(skc_tts_v_t const ttsv) -{ - return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2); -} - -// -// -// - -static -void -skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v) -{ - // get "altitude" - SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v); - - // get the y pixel coordinate - SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v); - - // - // FIXME -- benchmark performance of setting dy to 0 if tts_v is 
invalid? - // - // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op - // - -#if 0 - if (tts_v != SKC_TTS_INVALID) - printf("< %08X = %u : %d >\n",tts_v,py,dy); -#endif - - // - // scatter-add the "altitude" to accumulator - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C != SKC_TTS_INVALID) { \ - SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \ - } - -#else - // - // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS - // - // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C == SKC_TTS_INVALID) \ - return; \ - smem->accum.aN.ttp[py C] = dy C; -#endif - - SKC_PREFIX_TTS_VECTOR_INT_EXPAND(); -} - -// -// The implication here is that if our device configuration has a -// rectangular 1:2 tile then we need a block size of at least 2 -// subblocks. The subblock size of course needs to match the length of -// the smallest tile side. -// - -static -void -skc_accum_flush(__local struct skc_subgroup_smem * const smem, - __global skc_bp_elem_t * const bp_elems, - skc_block_id_t const pb_id) -{ - // load the ttp elements - SKC_PREFIX_TTP_V const ttp_v = smem->accum.vN.ttp[get_sub_group_local_id()]; - skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); - -#if ( SKC_TILE_RATIO == 1 ) - - bp_elems[offset] = ttp_v; - -#elif ( SKC_TILE_RATIO == 2 ) - - vstore2(ttp_v,offset,bp_elems); - -#else - -#error("tile ratio greater than 2 not supported") - -#endif -} - -// -// -// - -static -void -skc_accum_reset(__local struct skc_subgroup_smem * const smem) -{ - for (uint ii=0; iiaccum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 ); -} - -// -// get next sk key -// - -static -skc_ttsk_s_t -skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v, - skc_uint * const sk_next, - skc_int * const rkpk_rem) -{ - // decrement count - *rkpk_rem -= 1; - -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT with subgroup support is easy - // - // SIMT without subgroup support can always emulate with smem - // -#if 0 - // - // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly - // broadcast a uint2 cast to a long. It was probably bad to do this - // anyway without a union wrapping the TTSK scalar type. - // - // Consider creating a union { ulong; uint2 } at a later date -- - // probably no need to ever do this unless it makes broadcast faster - // which is unlikely since it will probably be implemented as 2 - // 32-bit broadcasts. - // - // Additionally, the TTRK and TTXK key bitfield sizes are probably - // cast in stone and we aren't going to change them no matter - // architecture we're on. 
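The dy handling in skc_tts_get_dy() earlier in this file is easy to misread, so here is a standalone host-side check of the same arithmetic. The 26-bit field offset and the zeroed low bits are assumptions for illustration; real TTS words carry the remaining coordinate fields in the low bits, which the arithmetic shift discards and which never affect the sign test. The sketch also assumes an arithmetic right shift of negative values, which OpenCL guarantees and mainstream C compilers provide.

/* standalone check of the skc_tts_get_dy() unpacking: the arithmetic right
   shift recovers the packed value in [-32,31], and subtracting
   (~ttsv >> 31) adds one exactly when dy is non-negative, mapping the
   packed range onto [-32..-1,+1..+32] with no zero slot                    */
#include <assert.h>
#include <stdint.h>

#define DEMO_TTS_OFFSET_DY 26 /* assumed field position: top 6 bits */

static int32_t
demo_tts_pack_dy(int32_t const dy_packed)
{
  return (int32_t)((uint32_t)dy_packed << DEMO_TTS_OFFSET_DY);
}

static int32_t
demo_tts_get_dy(int32_t const ttsv)
{
  int32_t const dy = ttsv >> DEMO_TTS_OFFSET_DY; /* sign-extending shift */

  return dy - (~ttsv >> 31);                     /* +1 iff dy >= 0       */
}

static void
demo_tts_check(void)
{
  assert(demo_tts_get_dy(demo_tts_pack_dy(  0)) ==   1);
  assert(demo_tts_get_dy(demo_tts_pack_dy( 31)) ==  32);
  assert(demo_tts_get_dy(demo_tts_pack_dy( -1)) ==  -1);
  assert(demo_tts_get_dy(demo_tts_pack_dy(-32)) == -32);
}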
- // - skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++); -#else - skc_ttsk_s_t sk_s; - - sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next); - sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next); - *sk_next += 1; -#endif - -#else - // - // SIMD will always grab component .s0 and then rotate the vector - // - sk_s = ( sk_v->s0 ); - - skc_ttsk_v_rotate_down(sk_v); - -#endif - - return sk_s; -} - -// -// -// - -static -skc_raster_yx_s -skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next) -{ -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT with subgroup support is easy - // - // SIMT without subgroup support can always emulate with smem - // - skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next); - -#else - // - // SIMD will always grab component .s0 and then rotate the vector - // - skc_raster_yx_s const yx_s = ( sk_v->s0.hi ); - -#endif - - return yx_s; -} - -// -// mask off ttsb id -// - -static -skc_block_id_s_t -skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s) -{ - return ( sk_s->lo & SKC_TTXK_LO_MASK_ID ); -} - -// -// load tts_v as early as possible -// - -static -skc_tts_v_t -skc_load_tts(__global skc_bp_elem_t * const bp_elems, - skc_block_id_s_t const sb_id) -{ - return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] ); -} - -// -// massage ttrk keys into ttsk keys -// - -static -void -skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v) -{ - sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID; // clear high (N/A) bits - sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits -} - -// -// replenish ttsk keys -// - -static -void -skc_ttsk_v_replenish(skc_ttsk_v_t * const sk_v, - skc_uint * const sk_next, - skc_uint * const rks_next, - __global skc_ttrk_e_t const * const rks) -{ - // if there are still keys available then return - if (*sk_next < SKC_PREFIX_TTXK_V_SIZE) - return; - - // - // otherwise, replenish sk_v - // - // NOTE NOTE NOTE -- we are assuming rks[] extent size is always - // divisible by TTXK_V_SIZE and therefore loading some keys from the - // next raster is OK. - // - *sk_next = 0; - *rks_next += SKC_PREFIX_SUBGROUP_SIZE; - *sk_v = rks[*rks_next]; - -#if 0 - printf("* %08X ( %3u, %3u )\n", - sk_v->hi, - (sk_v->hi >> 12) & 0xFFF, - (sk_v->hi ) & 0xFFF); -#endif - - skc_ttrk_to_ttsk(sk_v); - -#if 0 - printf("! %08X ( %3u, %3u )\n", - sk_v->hi, - (sk_v->hi >> 20) & 0xFFF, - (sk_v->hi >> 8) & 0xFFF); -#endif -} - -// -// replenish block ids -// -// note that you can't overrun the block id pool since it's a ring -// - -static -void -skc_blocks_replenish(skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) - -{ - *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE; - *blocks = bp_ids[*blocks_idx & bp_mask]; - *blocks_next = 0; - -#if 0 - printf("replenish blocks: %u\n",*blocks); -#endif -} - -// -// -// - -static -skc_block_id_t -skc_blocks_get_next(skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // replenish? 
- if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE) - { - skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - } - -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); - -#else - // - // SIMD - // - skc_block_id_t id = blocks->s0; - - skc_shuffle_down_1(*blocks); - -#endif - - *blocks_next += 1; - - return id; -} - -// -// subblock allocator -// - -#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) - -static -skc_block_id_t -skc_subblocks_get_next_pb_id(skc_block_id_t * const subblocks, - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - } - - skc_block_id_t const pb_id = *subblocks; - - *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks - - return pb_id; -} - -#endif - -// -// append a ttsk key to the work-in-progress node -// - -static -void -skc_node_v_append_sk(skc_ttsk_s_t const * const sk_s, - - skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - skc_uint * const xk_v_idx, - __global skc_bp_elem_t * const bp_elems, - - skc_int const rkpk_rem, - - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, - __global skc_block_id_t const * const bp_ids) -{ - // - // Append an sk key to the in-register xk_v vector - // - // If the work-in-progress node in gmem will only have room for one - // more key then: - // - // - if this was the final SK then write out xk_v and exit - // - // - otherwise, acquire a block id, link it, write out xk_v, - // prepare new node - // - // Note that this does *not* try to squeeze in a final key into the - // next node slot. This optimization isn't worth the added - // down-pipeline complexity. - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) - { - *xk_v = *sk_s; - } - - *xk_v_next += 1; - - // are there more keys coming? - if (rkpk_rem > 0) - { - // is the node almost full? - if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) - { - skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - - if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) - { - xk_v->lo = id; - xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary - } - - // store xk_v (uint2) to bp (uint) - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // update node elem idx - *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // reset node count - *xk_v_next = 0; - } - // is xk_v full? 
- else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) - { - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // increment node elem idx - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - } - } - else - { - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) - { - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - - bp_elems[*xk_v_idx] = SKC_UINT_MAX; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; - } - } - -#else - // - // SIMD - // - -#endif -} - -// -// -// - -static -skc_ttpk_s_t -skc_ttpk_create(skc_raster_yx_s const yx_prev, - skc_raster_yx_s const yx_next, - skc_block_id_t const pb_id) -{ - // - yx_prev is already incremented by one - // - yx_span is already shifted up at hi.x - skc_uint const yx_span = yx_next - yx_prev; - - skc_ttpk_s_t pk; - - // turn on prefix bit | shift span bits upward - pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN); - - // shift down high span bits | yx of tile - pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("* %08v2X : %u\n",pk,yx_span); -#endif - - return pk; -} - -// -// append a ttpk key to the work-in-progress node -// - -static -void -skc_node_v_append_pk(skc_ttpk_s_t const * const pk_s, - - skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - skc_uint * const xk_v_idx, - __global skc_bp_elem_t * const bp_elems, - - skc_uint * const blocks_next, - skc_uint * const blocks_idx, - skc_block_id_v_t * const blocks, - skc_uint const bp_mask, - __global skc_block_id_t const * const bp_ids) -{ - // - // append a pk key to the in-register xk_v vector - // - // if the work-in-progress node in gmem will only have room for one - // more key then: - // - // - if this was the final SK then write out xk_v and exit - // - // - otherwise, acquire a block id, link it, write out xk_v, - // prepare new node - // -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) - { - *xk_v = *pk_s; - } - - *xk_v_next += 1; - - // is the node almost full? - if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) - { - skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); - - if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) - { - xk_v->lo = id; - xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary - } - - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // update node elem idx - *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // reset node count - *xk_v_next = 0; - } - // is xk_v full? 
- else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) - { - // store xk_v to bp - bp_elems[*xk_v_idx ] = xk_v->lo; - bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; -#if 0 - printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v); -#endif - // reinitialize xk_v - xk_v->lo = SKC_UINT_MAX; - xk_v->hi = SKC_UINT_MAX; - - // increment node elem idx - *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - } - -#else - // - // SIMD - // -#endif -} - -// -// append the first 3 fields of meta info to the raster header -// - -static -void -skc_node_v_init_header(skc_ttxk_v_t * const xk_v, - skc_uint * const xk_v_next, - union skc_raster_cohort_meta_out const * const meta) -{ -#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - if (get_sub_group_local_id() < 2) - { - *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi; - } - -#if 0 - if (get_sub_group_local_id() == 0) - printf("header: %08v4X\n",meta->u32v4); -#endif - - // - // increment counter: uint4 + uint4 = uint2 x 4 - // - *xk_v_next = 2 + 2; // +2 for unitialized bounds - -#else - // - // SIMD - // - -#endif -} - -// -// -// - -__kernel -SKC_PREFIX_KERNEL_ATTRIBS -void -skc_kernel_prefix(__global skc_uint const * const bp_atomics, - __global skc_block_id_t const * const bp_ids, - __global skc_bp_elem_t * const bp_elems, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_ttrk_e_t const * const rks, - __global skc_block_id_t * const map, - __global skc_uint const * const metas, - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem smem[1]; -#else - __local struct skc_subgroup_smem smems[SKC_PREFIX_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id(); -#endif - - // - // where is this subgroup in the grid? - // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const sgi = get_group_id(0); -#else - skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - skc_uint const sgl = get_sub_group_local_id(); - - // - // return if this subgroup is excess - // -#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 ) - if (sgi >= count) - return; -#endif - - // - // get meta info for this subgroup's raster - // - union skc_raster_cohort_meta_out const meta = { vload4(sgi,metas) }; - skc_uint const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("%3u : %5u / %5u / %5u / %5u / %u\n", - sgi, - meta.blocks, - meta.offset, - meta.nodes, - meta.keys, - reads); -#endif - - // - // preload blocks -- align on subgroup - // - skc_uint blocks_idx = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); - skc_block_id_v_t blocks = bp_ids[blocks_idx & bp_mask]; - skc_uint blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK); - - // - // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset - // - skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); - - // - // initialize raster header -- assumes block is greater than 8 words (4 doublewords) - // - skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX }; - skc_uint xk_v_next; - - skc_node_v_init_header(&xk_v,&xk_v_next,&meta); - - // - // no keys -- this is an empty raster! 
- // - if (meta.keys == 0) - { - bp_elems[xk_v_idx ] = xk_v.lo; - bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi; - - while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) - { - xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; - - bp_elems[xk_v_idx] = SKC_UINT_MAX; - bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; - } - - return; - } - - // - // load TTRK keys and in-place convert to TTSK keys - // - skc_uint rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); - skc_ttsk_v_t sk_v = rks[rks_next]; - skc_uint sk_next = (meta.offset & SKC_PREFIX_SUBGROUP_MASK); - skc_int rkpk_rem = meta.keys; // signed count of remaining rk+pk keys - -#if 0 - printf("* %08X ( %3u, %3u )\n", - sk_v.hi, - (sk_v.hi >> 12) & 0xFFF, - (sk_v.hi ) & 0xFFF); -#endif - - skc_ttrk_to_ttsk(&sk_v); - -#if 0 - printf("! %08X ( %3u, %3u )\n", - sk_v.hi, - (sk_v.hi >> 20) & 0xFFF, - (sk_v.hi >> 8) & 0xFFF); -#endif - - // - // subblocks - // -#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) - skc_block_id_t subblocks = 0; -#endif - - // - // begin "scan" of tiles - // - skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next); - - // - // zero the accumulator - // - skc_accum_reset(smem); - - while (true) - { - // get next rk key - skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem); - - // load ttsb id - skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s); - - // load tts_v transaction "in flight" as early as possible - skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id); - -#if 0 - printf("{ %08X }\n",tts_v); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("[ %d, %X ]\n",rkpk_rem,sb_id); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF); -#endif - - // - // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF - // TIME AND SIMD'IZED - // - - // if yx's don't match then we're either issuing a ttpk or - // resetting the accumulator - if (sk_s.hi != yx_prev) - { - // if yx_next.y == yx_last.y then x changed - if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0) - { - // - // if the tile is not square then it's ratio is 1:2 - // -#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 - skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks, - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); -#else - skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); -#endif - - // flush accumulated ttp vector to block/subblock at ttpb_id - skc_accum_flush(smem,bp_elems,pb_id); - -#if 0 - if (get_sub_group_local_id() == 0) - { - printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n", - pb_id, - (yx_prev >> SKC_TTXK_HI_OFFSET_Y), - (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF, - (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF, - (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF); - } -#endif - - // - // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP - // - rkpk_rem -= 1; - - // create the pk - skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id); - - // append pk key to xk buffer - skc_node_v_append_pk(&pk_s, - - &xk_v, - &xk_v_next, - &xk_v_idx, - bp_elems, - - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); - } - else if (rkpk_rem > 0) // we're starting a new tile row - { - skc_accum_reset(smem); - } - } - - // - // append sk key to node_v - // - // if rkpk_rem is 
zero then return from kernel - // - skc_node_v_append_sk(&sk_s, - - &xk_v, - &xk_v_next, - &xk_v_idx, - bp_elems, - - rkpk_rem, - - &blocks_next, - &blocks_idx, - &blocks, - bp_mask, - bp_ids); - - // we're done if no more sk keys - if (rkpk_rem == 0) - break; - - // move to new tile - yx_prev = sk_s.hi; - - // scatter tts values into accumulator - skc_accum_scatter(smem,tts_v); - - // replenish sk keys - skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks); - } -} - -// -// -// diff --git a/src/compute/skc/raster_builder_cl_12.c b/src/compute/skc/raster_builder_cl_12.c deleted file mode 100644 index 33992cbdfb..0000000000 --- a/src/compute/skc/raster_builder_cl_12.c +++ /dev/null @@ -1,1349 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -// get rid of these -#include -#include - -// -// -// - -#include "hs/cl/hs_cl_launcher.h" - -#include "common/cl/assert_cl.h" - -#include "context.h" -#include "grid.h" -#include "raster.h" -#include "extent_ring.h" -#include "raster_builder.h" - -#include "tile.h" - -#include "config_cl.h" -#include "runtime_cl_12.h" -#include "extent_cl_12.h" -#include "raster_builder_cl_12.h" - -// -// RASTERIZATION SUB-PIPELINE -// -------------------------- -// -// Phase 1: expand commands -// -// Phase 2: rasterize -// -// Phase 3: sort & segment || release paths -// -// Phase 4: prefix -// -// Phase 5: release rasters -// -// RASTER COHORT -// ============== -// -// BUILDER RASTERIZER POST PROCESSING -// <-----------------------------------------------> <------------> <---------------------------------------------------------------------> -// -// fill cmds transforms raster clips path release rasterize cmds cohort map raster release TTSB TTSK cohort atomics context atomics -// --------- ---------- ------------ ------------ -------------- ---------- -------------- ---- ---- -------------- --------------- -// 1,2 1,2 1,2 1,2 2 1-4 1,2,3,4 2-4 2-4 2-4 global -// -// -// NOTES: FINE-GRAINED SVM -// ----------------------- -// -// 1) In a fine-grained system we know the exact number of -// rasterize cmds per segment type before phase 1 -// -// 2) A raster that's "under construction" shouldn't be rasterized -// until it is complete. This implies that a raster is not part -// of a cohort until it is complete. The raster builder must -// handle raster promises being "forced" to completion -- this is -// likely the result of composition construction and subsequent -// rendering to a surface. -// -// 3) The raster cohort rasterizer state retains the fill cmd, -// transform, raster clip and path release "ring" extents. -// -// 4) The rasterize cmd extent sizes (line, quad, cubic, rational -// quad, rational cubic) are known ahead of time. -// -// 5) The raster cohort post processor is standalone and retains the -// raster_map, cohort atomics, TTSK_RYX extent, and raster -// references until complete. -// - -// -// Notes: -// -// - Could have a pipeline stage before expansion count the exact -// number of line/quad/cubic commands but the command buffers are -// relatively small (64-bit commands * # of path segments). 
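//
// - As a rough sanity check on "relatively small" (the one-million
//   figure here is purely illustrative, not a measured workload):
//   each expanded command is a 64-bit skc_cmd_rasterize, so ~1M path
//   segments expand to only ~8 MB of rasterize commands.
//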
-// - -// raster -// cohort atomics path_ids raster_ids transforms clips cmds_fill cmds_l/q/c ttsk_ryx -// -// -// BEGIN ^ -// | -// EXPAND | -// | -// RASTERIZE | -// | -// SORT || RELEASE PATHS | -// | -// PREFIX | -// | -// RELEASE RASTERS | -// | -// END v -// -// -// BEGIN -// -// EXPAND -- PRODUCES: one or more extents of rasterization commands -// -// RASTERIZE -- DEPENDENCY: requires size of command extents before launching -// -- PRODUCES: an extent of ttsk_ryx keys -// -// SORT || RELEASE PATHS -- DEPENDENCY: requires size of key extent before launching -// -- PRODUCES: sorted array of keys -// -// PREFIX -- DEPENDENCY: none -- can execute after SORT because grid size is number of rasters -// -// RELEASE RASTERS -- DEPENDENCY: none -- can execute after prefix -// -// END -// - -// ------------------------ -// -// DEPENDENCY is cleanly implemented with a host callback or device kernel launcher -// -// Can this hide resource acquisition? Yes. But there are two cases: -// -// 1. acqusition of resources occurs on the host thread and lack of -// resources drains the host command queue until resources are -// available (OpenCL 2.x) -// -// 2. the host commands lazily acquire resources (OpenCL 1.2) -// -// ------------------------ -// -// How to express? -// -// Each substage launches its successors. This supports both dependency models. -// -// If OpenCL 1.2 then the substage can't be launched until the prior -// stage's event is complete. So this requires registering a callback -// to invoke the substage. -// -// ------------------------ - -// -// BUILD -// - -struct skc_raster_builder_impl -{ - struct skc_raster_builder * raster_builder; - struct skc_runtime * runtime; - - skc_grid_t cohort; - - // these are all durable/perm extents - struct skc_extent_phrwg_thr1s path_ids; // read/write by host - struct skc_extent_phw1g_tdrNs transforms; // write once by host + read by device - struct skc_extent_phw1g_tdrNs clips; // write once by host + read by device - struct skc_extent_phw1g_tdrNs fill_cmds; // write once by host + read by device - struct skc_extent_phrwg_tdrNs raster_ids; // read/write by host + read by device - - struct { - cl_kernel fills_expand; - cl_kernel rasterize_all; - cl_kernel segment; - cl_kernel rasters_alloc; - cl_kernel prefix; - } kernels; -}; - -// -// RASTER COHORT -// -// This sub-pipeline snapshots the raster builder and then acquires -// and releases host and device resources as necessary (as late as -// possible). -// -// Note that the cohort extents are ephemeral and are only used by one -// or more stages of a the rasterization sub-pipeline. -// -// The pipeline implementation may vary between compute platforms. 
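//
// For concreteness, the OpenCL 1.2 chaining described above looks
// roughly like the sketch below. The substage names skc_stage_1,
// skc_stage_2 and skc_stage_2_cb are placeholders; the helpers
// SKC_CL_CB, SKC_SCHEDULER_SCHEDULE, skc_grid_get_data, cl() and
// skc_extent_thr_tdrw_read are the same ones used by the real
// substages further below.
//

static void skc_stage_2(skc_grid_t const grid); // next substage (placeholder)

static
void
skc_stage_2_cb(cl_event event, cl_int status, skc_grid_t const grid)
{
  SKC_CL_CB(status);

  struct skc_raster_cohort * const cohort = skc_grid_get_data(grid);

  // don't run the substage on the driver thread -- bounce it back to
  // the context command scheduler as quickly as possible
  SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_stage_2,grid);
}

static
void
skc_stage_1(skc_grid_t const grid)
{
  struct skc_raster_cohort * const cohort = skc_grid_get_data(grid);

  // ... enqueue this substage's kernels on cohort->cq ...

  // read back the cohort atomics so the successor knows its grid size
  cl_event complete;

  skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete);

  // registering the callback is what launches the successor substage
  cl(SetEventCallback(complete,CL_COMPLETE,skc_stage_2_cb,grid));
  cl(ReleaseEvent(complete));

  // flush so the enqueued work actually starts
  cl(Flush(cohort->cq));
}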
-// - -struct skc_raster_cohort -{ - struct skc_raster_builder_impl * impl; - - struct skc_extent_phrwg_thr1s_snap path_ids; // read/write by host - struct skc_extent_phw1g_tdrNs_snap transforms; // write once by host + read by device - struct skc_extent_phw1g_tdrNs_snap clips; // write once by host + read by device - struct skc_extent_phw1g_tdrNs_snap fill_cmds; // write once by host + read by device - struct skc_extent_phrwg_tdrNs_snap raster_ids; // read/write by host + read by device - - cl_command_queue cq; - - // sub-pipeline atomics - struct skc_extent_thr_tdrw atomics; - - // path primitives are expanded into line/quad/cubic/rational cmds - struct skc_extent_tdrw cmds; - - // rasterization output - struct skc_extent_tdrw keys; - // struct skc_extent_thrw_tdrw keys; - - // post-sort extent with metadata for each raster - struct skc_extent_tdrw metas; - // struct skc_extent_thrw_tdrw metas; - - // subbuf id - skc_subbuf_id_t id; - - // - // pipeline also uses the following global resources: - // - // - command queue from global factory - // - global block pool and its atomics - // - global path and raster host id map - // - temporary host and device allocations - // -}; - -// -// TTRK (64-BIT COMPARE) -// -// 0 63 -// | TTSB ID | X | Y | COHORT ID | -// +---------+------+------+-----------+ -// | 27 | 12 | 12 | 13 | -// -// -// TTRK (32-BIT COMPARE) -// -// 0 63 -// | TTSB ID | N/A | X | Y | COHORT ID | -// +---------+-----+------+------+-----------+ -// | 27 | 5 | 12 | 12 | 8 | -// - -// -// TTRK is sortable intermediate key format for TTSK -// -// We're going to use the 32-bit comparison version for now -// - -union skc_ttrk -{ - skc_ulong u64; - skc_uint2 u32v2; - - struct { - skc_uint block : SKC_TTXK_LO_BITS_ID; - skc_uint na0 : SKC_TTRK_LO_BITS_NA; - skc_uint x : SKC_TTXK_HI_BITS_X; - skc_uint y : SKC_TTXK_HI_BITS_Y; - skc_uint cohort : SKC_TTRK_HI_BITS_COHORT; - }; - - struct { - skc_uint na1; - skc_uint yx : SKC_TTXK_HI_BITS_YX; - skc_uint na2 : SKC_TTRK_HI_BITS_COHORT; - }; - - struct { - skc_uint na3; - skc_uint na4 : SKC_TTXK_HI_BITS_X; - skc_uint cohort_y : SKC_TTRK_HI_BITS_COHORT_Y; - }; -}; - -// -// -// - -static -void -skc_raster_builder_pfn_release(struct skc_raster_builder_impl * const impl) -{ - // decrement reference count - if (--impl->raster_builder->refcount != 0) - return; - - // - // otherwise, dispose of the the raster builder and its impl - // - struct skc_runtime * const runtime = impl->runtime; - - // free the raster builder - skc_runtime_host_perm_free(runtime,impl->raster_builder); - - // free durable/perm extents - skc_extent_phrwg_thr1s_free(runtime,&impl->path_ids); - skc_extent_phw1g_tdrNs_free(runtime,&impl->transforms); - skc_extent_phw1g_tdrNs_free(runtime,&impl->clips); - skc_extent_phw1g_tdrNs_free(runtime,&impl->fill_cmds); - skc_extent_phrwg_tdrNs_free(runtime,&impl->raster_ids); - - // release kernels - cl(ReleaseKernel(impl->kernels.fills_expand)); - cl(ReleaseKernel(impl->kernels.rasterize_all)); - -#if 0 - cl(ReleaseKernel(impl->kernels.rasterize_lines)); - cl(ReleaseKernel(impl->kernels.rasterize_quads)); - cl(ReleaseKernel(impl->kernels.rasterize_cubics)); -#endif - - cl(ReleaseKernel(impl->kernels.segment)); - cl(ReleaseKernel(impl->kernels.rasters_alloc)); - cl(ReleaseKernel(impl->kernels.prefix)); - - // free the impl - skc_runtime_host_perm_free(runtime,impl); -} - -// -// -// - -static -void -skc_raster_builder_rasters_release(struct skc_runtime * const runtime, - skc_raster_t const * const rasters, - skc_uint const size, - 
skc_uint const from, - skc_uint const to) -{ - if (from <= to) // no wrap - { - skc_raster_t const * rasters_from = rasters + from; - skc_uint count_from = to - from; - - skc_grid_deps_unmap(runtime->deps,rasters_from,count_from); - skc_runtime_raster_device_release(runtime,rasters_from,count_from); - } - else // from > to implies wrap - { - skc_raster_t const * rasters_lo = rasters + from; - skc_uint count_lo = size - from; - - skc_grid_deps_unmap(runtime->deps,rasters_lo,count_lo); - skc_runtime_raster_device_release(runtime,rasters_lo,count_lo); - - skc_grid_deps_unmap(runtime->deps,rasters,to); - skc_runtime_raster_device_release(runtime,rasters,to); - } -} - -static -void -skc_raster_builder_paths_release(struct skc_runtime * const runtime, - struct skc_extent_phrwg_thr1s_snap * const snap) -{ - // release lo - skc_runtime_path_device_release(runtime,snap->hr1.lo,snap->count.lo); - - // release hi - if (snap->count.hi) - skc_runtime_path_device_release(runtime,snap->hr1.hi,snap->count.hi); -} - -static -void -skc_raster_builder_cohort_grid_pfn_dispose(skc_grid_t const grid) -{ - // - // ALLOCATED RESOURCES - // - // path_ids - - // raster_ids a - // transforms - - // clips - - // fill_cmds - - // cq a - // cohort atomics a - // cmds - - // keys a - // meta a - // - - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - struct skc_raster_builder_impl * const impl = cohort->impl; - struct skc_runtime * const runtime = impl->runtime; - - // - // release paths -- FIXME -- Note that releasing paths can be - // performed after rasterization is complete - // - - // snap alloc the paths -- this host snap simply sets up pointers - skc_extent_phrwg_thr1s_snap_alloc(runtime,&impl->path_ids,&cohort->path_ids); - - // unmap and release raster ids - skc_raster_builder_paths_release(runtime,&cohort->path_ids); - - // release path ids - skc_extent_phrwg_thr1s_snap_free(runtime,&cohort->path_ids); - - // - // release rasters - // - skc_uint const size = cohort->raster_ids.snap->ring->size.pow2; - skc_uint const from = skc_extent_ring_snap_from(cohort->raster_ids.snap); - skc_uint const to = skc_extent_ring_snap_to(cohort->raster_ids.snap); - - // unmap and release raster ids - skc_raster_builder_rasters_release(runtime,impl->raster_ids.hrw,size,from,to); - - // release cohort's remaining allocated resources - skc_extent_phrwg_tdrNs_snap_free(runtime,&cohort->raster_ids); - skc_runtime_release_cq_in_order(runtime,cohort->cq); - skc_extent_thr_tdrw_free(runtime,&cohort->atomics); - skc_extent_tdrw_free(runtime,&cohort->keys); - skc_extent_tdrw_free(runtime,&cohort->metas); - // skc_extent_thrw_tdrw_free(runtime,&cohort->keys); - // skc_extent_thrw_tdrw_free(runtime,&cohort->metas); - skc_runtime_host_temp_free(runtime,cohort,cohort->id); - - // release the raster builder - skc_raster_builder_pfn_release(impl); - - // - // ALLOCATED RESOURCES - // - // path_ids - - // raster_ids - - // transforms - - // clips - - // fill_cmds - - // cq - - // cohort atomics - - // cmds - - // keys - - // meta - - // -} - -// -// -// - -static -void -skc_raster_cohort_prefix_release(skc_grid_t const grid) -{ - // FIXME -- note that pfn_dispose can be accomplished here - - // release the grid - skc_grid_complete(grid); -} - -static -void -skc_raster_cohort_prefix_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - struct skc_scheduler * const scheduler = cohort->impl->runtime->scheduler; - - // as quickly 
as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(scheduler,skc_raster_cohort_prefix_release,grid); -} - -// -// -// - -#if 0 -static -int cmp64(const void * ptr_a, const void * ptr_b) -{ - skc_ulong const a = *(const skc_ulong *)ptr_a; - skc_ulong const b = *(const skc_ulong *)ptr_b; - - if (a < b) return -1; - if (a > b) return +1; - else return 0; -} -#endif - -// -// -// - -static -void -skc_raster_cohort_sort_prefix(skc_grid_t const grid) -{ - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms a - // clips a - // fill_cmds - - // cq a - // cohort atomics a - // cmds a - // keys a - // meta - - // - - // use the backpointers - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - struct skc_raster_builder_impl * const impl = cohort->impl; - struct skc_runtime * const runtime = impl->runtime; - - // release transforms - skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->transforms); - - // release clips - skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->clips); - - // release expanded cmds - skc_extent_tdrw_free(runtime,&cohort->cmds); - - // alloc the snapshost -- could be zero-sized - skc_extent_phrwg_tdrNs_snap_alloc(runtime, - &impl->raster_ids, - &cohort->raster_ids, - cohort->cq,NULL); - - // will never be zero - skc_uint const rasters = skc_extent_ring_snap_count(cohort->raster_ids.snap); - - // acquire fixed-size device-side extent - skc_extent_tdrw_alloc(runtime, - &cohort->metas, - sizeof(struct skc_raster_cohort_meta)); - - // skc_extent_thrw_tdrw_alloc(runtime, - // &cohort->metas, - // sizeof(struct skc_raster_cohort_meta)); - - // zero the metas - skc_extent_tdrw_zero(&cohort->metas,cohort->cq,NULL); - - // get the read-only host copy of the device atomics - struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr; - - // - // SORT - // - if (atomics->keys > 0) - { -#ifndef NDEBUG - fprintf(stderr,"raster cohort sort: %u\n",atomics->keys); -#endif - - // - // - // - uint32_t keys_padded_in, keys_padded_out; - - hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); - - hs_sort(cohort->cq, - cohort->keys.drw, - cohort->keys.drw, - atomics->keys, - keys_padded_in, - keys_padded_out, - false); - - cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(cohort->keys.drw))); - cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(cohort->metas.drw))); - -#ifndef NDEBUG - fprintf(stderr,"post-sort\n"); -#endif - - // find start of each tile - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK, - cohort->cq, - impl->kernels.segment, - atomics->keys, - 0,NULL,NULL); - -#ifndef NDEBUG - fprintf(stderr,"post-segment\n"); -#endif - - // - // DELETE ALL THIS WHEN READY - // - -#if 0 - // - // - // - cl(Finish(cohort->cq)); - - // map keys to host - union skc_ttrk * const keys = skc_extent_thrw_tdrw_map(&cohort->keys, - cohort->cq, - NULL); - // map meta to host - struct skc_raster_cohort_meta * const metas = skc_extent_thrw_tdrw_map(&cohort->metas, - cohort->cq, - NULL); - // block until done - cl(Finish(cohort->cq)); - - // sort keys - qsort(keys,atomics->keys,sizeof(*keys),cmp64); - - // mask to determine if rk id is a new block - skc_uint const subblock_mask = runtime->config->block.subblocks - 1; - - // - // some counters - // - union skc_raster_cohort_meta_in meta_in = { - .blocks = 0, - .offset = 0, - .pk = 0, - .rk = 0 - }; - - // get first key - union skc_ttrk curr = keys[0]; - - skc_uint ii=0, jj=0; - - // for all TTRK keys - while (true) - { - 
// increment ttrk count - meta_in.rk += 1; - - // was this a new block? - if ((curr.u32v2.lo & subblock_mask) == 0) - meta_in.blocks += 1; - - // break if we're out of keys - if (++ii >= atomics->keys) - break; - - // otherwise, process next key - union skc_ttrk const next = keys[ii]; - - // if new cohort then save curr meta and init next meta - if (next.cohort != curr.cohort) - { - fprintf(stderr,"[ %u, %u, %u, %u ]\n", - meta_in.blocks, - meta_in.offset, - meta_in.pk, - meta_in.rk); - - // store back to buffer - metas->inout[curr.cohort].in = meta_in; - - // update meta_in - meta_in.blocks = 0; - meta_in.offset = ii; - meta_in.pk = 0; - meta_in.rk = 0; - } - // otherwise, if same y but new x then increment TTPK count - else if ((next.y == curr.y) && (next.x != curr.x)) - { - meta_in.pk += 1; - -#if 0 - fprintf(stderr,"%3u : %3u : ( %3u, %3u ) -> ( %3u )\n", - jj++,curr.cohort,curr.y,curr.x,next.x); -#endif - } - -#if 0 - fprintf(stderr,"( %3u, %3u )\n",next.y,next.x); -#endif - - curr = next; - } - - fprintf(stderr,"[ %u, %u, %u, %u ]\n", - meta_in.blocks, - meta_in.offset, - meta_in.pk, - meta_in.rk); - - // store back to buffer - metas->inout[curr.cohort].in = meta_in; - - - // unmap - skc_extent_thrw_tdrw_unmap(&cohort->keys, - keys, - cohort->cq, - NULL); - - // unmap - skc_extent_thrw_tdrw_unmap(&cohort->metas, - metas, - cohort->cq, - NULL); -#endif - } - -#ifndef NDEBUG - fprintf(stderr,"rasters_alloc: %u\n",rasters); -#endif - - // - // RASTER ALLOC/INIT - // - cl(SetKernelArg(impl->kernels.rasters_alloc,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); - cl(SetKernelArg(impl->kernels.rasters_alloc,1,SKC_CL_ARG(runtime->block_pool.ids.drw))); - cl(SetKernelArg(impl->kernels.rasters_alloc,2,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); - cl(SetKernelArg(impl->kernels.rasters_alloc,3,SKC_CL_ARG(runtime->handle_pool.map.drw))); - cl(SetKernelArg(impl->kernels.rasters_alloc,4,SKC_CL_ARG(cohort->metas.drw))); - cl(SetKernelArg(impl->kernels.rasters_alloc,5,SKC_CL_ARG(cohort->raster_ids.drN))); - cl(SetKernelArg(impl->kernels.rasters_alloc,6,SKC_CL_ARG(rasters))); - - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC, - cohort->cq, - impl->kernels.rasters_alloc, - rasters, - 0,NULL,NULL); - -#ifndef NDEBUG - fprintf(stderr,"post-alloc\n"); -#endif - - // - // PREFIX - // - cl(SetKernelArg(impl->kernels.prefix,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); - cl(SetKernelArg(impl->kernels.prefix,1,SKC_CL_ARG(runtime->block_pool.ids.drw))); - cl(SetKernelArg(impl->kernels.prefix,2,SKC_CL_ARG(runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.prefix,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); - - cl(SetKernelArg(impl->kernels.prefix,4,SKC_CL_ARG(cohort->keys.drw))); - cl(SetKernelArg(impl->kernels.prefix,5,SKC_CL_ARG(runtime->handle_pool.map.drw))); - - cl(SetKernelArg(impl->kernels.prefix,6,SKC_CL_ARG(cohort->metas.drw))); - cl(SetKernelArg(impl->kernels.prefix,7,SKC_CL_ARG(rasters))); - - cl_event complete; - - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_PREFIX, - cohort->cq, - impl->kernels.prefix, - rasters, - 0,NULL, - &complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_prefix_cb,grid)); - cl(ReleaseEvent(complete)); - -#ifndef NDEBUG - fprintf(stderr,"post-prefix\n"); -#endif - - // flush command queue - cl(Flush(cohort->cq)); - - // - // ALLOCATED RESOURCES - // - // path_ids a - // raster_ids a - // transforms - - // clips - - // fill_cmds - - // cq a - // cohort atomics a - // 
cmds - - // keys a - // meta a - // -} - -static -void -skc_raster_cohort_rasterize_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_sort_prefix,grid); -} - -static -void -skc_raster_cohort_rasterize(skc_grid_t const grid) -{ - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms i - // clips i - // fill_cmds s - // cq a - // cohort atomics a - // cmds a - // cmds_quad a - // cmds_cubic a - // keys - - // meta - - - // use the backpointers - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - struct skc_raster_builder_impl * const impl = cohort->impl; - struct skc_runtime * const runtime = impl->runtime; - - // - // RELEASED RESOURCES - // - // cmds snap - // - - // release the cmds extent and snap since it's only used by the expand stage - skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->fill_cmds); - - // - // NEW ALLOCATED RESOURCES - // - // transforms snap - // clips snap - // ttrk keys - // - skc_extent_phw1g_tdrNs_snap_alloc(runtime, - &impl->transforms, - &cohort->transforms, - cohort->cq,NULL); - - skc_extent_phw1g_tdrNs_snap_alloc(runtime, - &impl->clips, - &cohort->clips, - cohort->cq,NULL); - - // acquire device-side extent - skc_extent_tdrw_alloc(runtime, - &cohort->keys, - sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys); - - // skc_extent_thrw_tdrw_alloc(runtime, - // &cohort->keys, - // sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys); - - // - // acquire out-of-order command queue - // - // and launch up to 3 kernels - // - // for each kernel: - // - // set runtime "global" kernel args: - // - // - block pool atomics - // - block pool extent - // - // set cohort "local" kernel args: - // - // - atomics - // - cmds - // - // enqueue barrier - // enqueue copy back of atomics on the command queue - // set callback on copy back event - // release command queue - // - struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr; - - if (atomics->cmds > 0) - { - cl(SetKernelArg(impl->kernels.rasterize_all,0,SKC_CL_ARG(runtime->block_pool.atomics.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,1,SKC_CL_ARG(runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,2,SKC_CL_ARG(runtime->block_pool.ids.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask))); - - cl(SetKernelArg(impl->kernels.rasterize_all,4,SKC_CL_ARG(cohort->atomics.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,5,SKC_CL_ARG(cohort->keys.drw))); - - cl(SetKernelArg(impl->kernels.rasterize_all,6,SKC_CL_ARG(cohort->transforms.drN))); - cl(SetKernelArg(impl->kernels.rasterize_all,7,SKC_CL_ARG(cohort->clips.drN))); - cl(SetKernelArg(impl->kernels.rasterize_all,8,SKC_CL_ARG(cohort->cmds.drw))); - cl(SetKernelArg(impl->kernels.rasterize_all,9,SKC_CL_ARG(atomics->cmds))); - - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL, - cohort->cq, - impl->kernels.rasterize_all, - atomics->cmds, - 0,NULL,NULL); - } - - // - // copyback number of TTSK keys - // - cl_event complete; - - skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_rasterize_cb,grid)); - 
cl(ReleaseEvent(complete)); - - // flush command queue - cl(Flush(cohort->cq)); - - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms a - // clips a - // fill_cmds - - // cq a - // cohort atomics a - // cmds a - // keys a - // meta - -} - -static -void -skc_raster_cohort_fills_expand_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_rasterize,grid); -} - -static -void -skc_raster_builder_cohort_grid_pfn_execute(skc_grid_t const grid) -{ - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms i - // clips i - // fill_cmds i - // cq - - // cohort atomics - - // cmds - - // keys - - // meta - - // - - // allocate the cohort - struct skc_raster_cohort * const cohort = skc_grid_get_data(grid); - - // get impl - struct skc_raster_builder_impl * const impl = cohort->impl; - struct skc_runtime * const runtime = impl->runtime; - - // acquire in-order cq - cohort->cq = skc_runtime_acquire_cq_in_order(runtime); - - // alloc the snapshot -- could be zero-sized - skc_extent_phw1g_tdrNs_snap_alloc(runtime, - &impl->fill_cmds, - &cohort->fill_cmds, - cohort->cq,NULL); - - // flush the cq to get the fill running - // cl(Flush(cohort->cq)); - - // create split atomics - skc_extent_thr_tdrw_alloc(runtime,&cohort->atomics,sizeof(struct skc_raster_cohort_atomic)); - - // zero the atomics - skc_extent_thr_tdrw_zero(&cohort->atomics,cohort->cq,NULL); - - // get config - struct skc_config const * const config = runtime->config; - - // acquire device-side extents - skc_extent_tdrw_alloc(runtime, - &cohort->cmds, - sizeof(union skc_cmd_rasterize) * config->raster_cohort.expand.cmds); - - // - // FILLS EXPAND - // - // need result of cmd counts before launching RASTERIZE grids - // - // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host - // - OpenCL 2.x: have a kernel size and launch RASTERIZE grids from device - // - or launch a device-wide grid that feeds itself but that's unsatisfying - // - - // how many commands? 
could be zero - skc_uint const work_size = skc_extent_ring_snap_count(cohort->fill_cmds.snap); - - if (work_size > 0) - { - cl(SetKernelArg(impl->kernels.fills_expand,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); - cl(SetKernelArg(impl->kernels.fills_expand,1,SKC_CL_ARG(cohort->atomics.drw))); - cl(SetKernelArg(impl->kernels.fills_expand,2,SKC_CL_ARG(runtime->handle_pool.map.drw))); - cl(SetKernelArg(impl->kernels.fills_expand,3,SKC_CL_ARG(cohort->fill_cmds.drN))); - cl(SetKernelArg(impl->kernels.fills_expand,4,SKC_CL_ARG(cohort->cmds.drw))); - - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_FILLS_EXPAND, - cohort->cq, - impl->kernels.fills_expand, - work_size, - 0,NULL,NULL); - } - - // - // copyback number of rasterization commands - // - cl_event complete; - - skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_fills_expand_cb,grid)); - cl(ReleaseEvent(complete)); - - // flush command queue - cl(Flush(cohort->cq)); - - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms i - // clips i - // fill_cmds s - // cq a - // cohort atomics a - // cmds a - // keys - - // meta - - // -} - -// -// move grid into waiting state -// -// this entails allocating a cohort from the temporary extent -// - -static -void -skc_raster_builder_cohort_grid_pfn_waiting(skc_grid_t const grid) -{ - // get the impl - struct skc_raster_builder_impl * const impl = skc_grid_get_data(grid); - struct skc_runtime * const runtime = impl->runtime; - - // retain the raster builder - impl->raster_builder->refcount += 1; - - // allocate the ephemeral/temp cohort - skc_subbuf_id_t id; - - struct skc_raster_cohort * const cohort = - skc_runtime_host_temp_alloc(runtime, - SKC_MEM_FLAGS_READ_WRITE, - sizeof(*cohort), - &id, - NULL); - - // save the id and backpointer - cohort->id = id; - cohort->impl = impl; - - // set grid data -- replaces impl - skc_grid_set_data(grid,cohort); - - // - // ACQUIRE RESOURCES FOR THE COHORT - // - - struct skc_raster_builder * const raster_builder = impl->raster_builder; - - // immediately take snapshots of all rings -- these are very inexpensive operations - skc_extent_phrwg_thr1s_snap_init(runtime,&raster_builder->path_ids .ring,&cohort->path_ids); - skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->transforms.ring,&cohort->transforms); - skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->clips .ring,&cohort->clips); - skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->fill_cmds .ring,&cohort->fill_cmds); - skc_extent_phrwg_tdrNs_snap_init(runtime,&raster_builder->raster_ids.ring,&cohort->raster_ids); - - // - // ALLOCATED RESOURCES - // - // path_ids i - // raster_ids i - // transforms i - // clips i - // fill_cmds i - // cq - - // cohort atomics - - // cmds - - // keys - - // meta - - // -} - -// -// -// - -static -void -skc_raster_builder_cohort_create(struct skc_raster_builder_impl * const impl) -{ - // attach a grid - impl->cohort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - &impl->cohort, - impl, - skc_raster_builder_cohort_grid_pfn_waiting, - skc_raster_builder_cohort_grid_pfn_execute, - skc_raster_builder_cohort_grid_pfn_dispose); -} - -// -// -// - -static -skc_err -skc_raster_builder_pfn_add(struct skc_raster_builder_impl * const impl, - skc_path_t const * paths, - skc_uint count) -{ - // validate and retain the path - skc_err err; - - err = skc_runtime_handle_device_validate_retain(impl->runtime, - SKC_TYPED_HANDLE_TYPE_IS_PATH, - 
paths, - count); - - if (err) - return err; - - skc_runtime_handle_device_retain(impl->runtime,paths,count); - - // make sure there is a grid - if (impl->cohort == NULL) { - skc_raster_builder_cohort_create(impl); - } - - // declare rasterization grid happens after path - while (count-- > 0) - skc_grid_happens_after_handle(impl->cohort,SKC_TYPED_HANDLE_TO_HANDLE(*paths++)); - - return SKC_ERR_SUCCESS; -} - -// -// -// - -static -void -skc_raster_builder_pfn_end(struct skc_raster_builder_impl * const impl, skc_raster_t * const raster) -{ - // - // acquire host-managed path raster handle and bump reference count - // to 2 handles will be released (reduced to 1) once the rasters are - // completely rasterized - // - *raster = skc_runtime_handle_device_acquire(impl->runtime); - - // make sure there is a grid - if (impl->cohort == NULL) { - skc_raster_builder_cohort_create(impl); - } - - // map a handle to a grid - skc_grid_map(impl->cohort,*raster); -} - -// -// snapshot the ring and lazily start the grid -// -// FIXME -- might want to revisit this and settle on an even more -// opaque implementation. Some options: -// -// - never let the SKC API expose a forced grid start -// - make snapshots kick off a forced grid start -// - be lazy all the time everywhere -// - -static -void -skc_raster_builder_pfn_start(struct skc_raster_builder_impl * const impl) -{ - skc_grid_t const cohort = impl->cohort; - - if (cohort != NULL) { - skc_grid_start(cohort); - } -} - -// -// NOTE: THIS MIGHT BE REMOVED -// - -static -void -skc_raster_builder_pfn_force(struct skc_raster_builder_impl * const impl) -{ - skc_grid_t const cohort = impl->cohort; - - if (cohort != NULL) { - skc_grid_force(cohort); - } -} - -// -// -// - -skc_err -skc_raster_builder_cl_12_create(struct skc_context * const context, - struct skc_raster_builder * * const raster_builder) -{ - struct skc_runtime * const runtime = context->runtime; - - // allocate raster builder - (*raster_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**raster_builder)); - - // refcount - (*raster_builder)->refcount = 1; - - // state - SKC_ASSERT_STATE_INIT((*raster_builder),SKC_RASTER_BUILDER_STATE_READY); - - // allocate runtime raster builder - struct skc_raster_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - // save the impl - (*raster_builder)->impl = impl; - - // intialize impl - impl->raster_builder = (*raster_builder); - impl->runtime = runtime; - impl->cohort = NULL; - - // get config - struct skc_config const * const config = runtime->config; - - skc_extent_phrwg_thr1s_alloc(runtime,&impl->path_ids ,sizeof(skc_path_t ) * config->raster_cohort.path_ids .elem_count); - skc_extent_phw1g_tdrNs_alloc(runtime,&impl->transforms,sizeof(union skc_transform) * config->raster_cohort.transforms.elem_count); - skc_extent_phw1g_tdrNs_alloc(runtime,&impl->clips ,sizeof(union skc_path_clip) * config->raster_cohort.clips .elem_count); - skc_extent_phw1g_tdrNs_alloc(runtime,&impl->fill_cmds ,sizeof(union skc_cmd_fill ) * config->raster_cohort.fill .elem_count); - skc_extent_phrwg_tdrNs_alloc(runtime,&impl->raster_ids,sizeof(skc_raster_t ) * config->raster_cohort.raster_ids.elem_count); - - // retain the context - //skc_context_retain(context); - - (*raster_builder)->context = context; - - (*raster_builder)->add = skc_raster_builder_pfn_add; - (*raster_builder)->end = skc_raster_builder_pfn_end; - (*raster_builder)->start = skc_raster_builder_pfn_start; - (*raster_builder)->force = 
skc_raster_builder_pfn_force; - (*raster_builder)->release = skc_raster_builder_pfn_release; - - // initialize raster builder with host-writable buffers - (*raster_builder)->path_ids .extent = impl->path_ids.hrw; - (*raster_builder)->transforms.extent = impl->transforms.hw1; - (*raster_builder)->clips .extent = impl->clips.hw1; - (*raster_builder)->fill_cmds .extent = impl->fill_cmds.hw1; - (*raster_builder)->raster_ids.extent = impl->raster_ids.hrw; - - // - // the rings perform bookkeeping on the extents - // - // the ring snapshotting and checkpointing are necessary because - // another part of the API can _force_ the raster cohort to flush - // its work-in-progress commands but only up to a checkpointed - // boundary - // - skc_extent_ring_init(&(*raster_builder)->path_ids.ring, - config->raster_cohort.path_ids.elem_count, - config->raster_cohort.path_ids.snap_count, - sizeof(skc_path_t)); - - skc_extent_ring_init(&(*raster_builder)->transforms.ring, - config->raster_cohort.transforms.elem_count, - config->raster_cohort.transforms.snap_count, - sizeof(union skc_transform)); - - skc_extent_ring_init(&(*raster_builder)->clips.ring, - config->raster_cohort.clips.elem_count, - config->raster_cohort.clips.snap_count, - sizeof(union skc_path_clip)); - - skc_extent_ring_init(&(*raster_builder)->fill_cmds.ring, - config->raster_cohort.fill.elem_count, - config->raster_cohort.fill.snap_count, - sizeof(union skc_cmd_fill)); - - skc_extent_ring_init(&(*raster_builder)->raster_ids.ring, - config->raster_cohort.raster_ids.elem_count, - config->raster_cohort.raster_ids.snap_count, - sizeof(skc_raster_t)); - - // - // acquire kernels - // - impl->kernels.fills_expand = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_FILLS_EXPAND); - impl->kernels.rasterize_all = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL); - -#if 0 - impl->kernels.rasterize_lines = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES); - impl->kernels.rasterize_quads = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS); - impl->kernels.rasterize_cubics = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS); -#endif - - impl->kernels.segment = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK); - impl->kernels.rasters_alloc = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC); - impl->kernels.prefix = skc_device_acquire_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_PREFIX); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/raster_builder_cl_12.h b/src/compute/skc/raster_builder_cl_12.h deleted file mode 100644 index f6e1751ef1..0000000000 --- a/src/compute/skc/raster_builder_cl_12.h +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#ifndef SKC_RASTER_BUILDER_CL_12_ONCE -#define SKC_RASTER_BUILDER_CL_12_ONCE - -// -// -// - -#include "types.h" -#include "macros.h" -#include "common.h" - -// -// FIXME -- these magic numbers will be replaced with tile.h constants -// although they're probably universal across all devices -// -// FIXME -- NEED TO EVALUATE IF THIS DISTRIBUTION OF BITS IS GOING TO -// BE TOO SMALL -- plenty of room to jiggle these bits -// - -#define SKC_CMD_RASTERIZE_BITS_TRANSFORM 12 -#define SKC_CMD_RASTERIZE_BITS_CLIP 12 -#define SKC_CMD_RASTERIZE_BITS_COHORT 8 - -SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_TRANSFORM == SKC_CMD_FILL_BITS_TRANSFORM); -SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_CLIP == SKC_CMD_FILL_BITS_CLIP); -SKC_STATIC_ASSERT(SKC_CMD_RASTERIZE_BITS_COHORT == SKC_CMD_FILL_BITS_COHORT); - -// -// device-side rasterization cmd -// - -union skc_cmd_rasterize -{ - skc_ulong u64; - - skc_uint2 u32v2; - - struct { - // - // Unlike anywhere else in the pipeline, the nodeword index points - // "inside" of a path node (with word resolution). This means - // there is up to 16 GB of 32-bit word addressing in a unified - // block pool: - // - // "16GB ought to be enough for anyone" -- ASM 5/30/17 - // - skc_uint nodeword; -#if defined(__OPENCL_C_VERSION__) - skc_uint tcc; -#else - skc_uint transform : SKC_CMD_RASTERIZE_BITS_TRANSFORM; - skc_uint clip : SKC_CMD_RASTERIZE_BITS_CLIP; - skc_uint cohort : SKC_CMD_RASTERIZE_BITS_COHORT; -#endif - }; -}; - -SKC_STATIC_ASSERT(sizeof(union skc_cmd_rasterize) == sizeof(skc_uint2)); - -// -// -// - -#define SKC_CMD_RASTERIZE_HI_OFFSET_COHORT (SKC_CMD_RASTERIZE_BITS_TRANSFORM + SKC_CMD_RASTERIZE_BITS_CLIP) -#define SKC_CMD_RASTERIZE_MASK_COHORT(c) ((c).u32v2.hi & SKC_BITS_TO_MASK_AT(SKC_CMD_RASTERIZE_BITS_COHORT,SKC_CMD_RASTERIZE_HI_OFFSET_COHORT)) - -#define SKC_CMD_RASTERIZE_GET_TRANSFORM(c) ((c).u32v2.hi & SKC_BITS_TO_MASK(SKC_CMD_RASTERIZE_BITS_TRANSFORM)) -#define SKC_CMD_RASTERIZE_GET_CLIP(c) SKC_BFE((c).tcc,SKC_CMD_RASTERIZE_BITS_CLIP,SKC_CMD_RASTERIZE_BITS_TRANSFORM) -#define SKC_CMD_RASTERIZE_GET_COHORT(c) ((c).u32v2.hi >> SKC_CMD_RASTERIZE_HI_OFFSET_COHORT) -// SKC_BFE((c).tcc,SKC_CMD_RASTERIZE_BITS_COHORT,SKC_CMD_RASTERIZE_HI_OFFSET_COHORT) - -// -// -// - -#define SKC_TTSK_SIZE_COHORT (1 << SKC_CMD_RASTERIZE_BITS_COHORT) - -// -// COHORT META DATA -// - -union skc_raster_cohort_meta_in -{ - skc_uint4 u32v4; - - struct { - skc_uint blocks; // # of rk blocks - skc_uint offset; // start of rk span - skc_uint pk; // # of pk keys - skc_uint rk; // # of rk keys - }; -}; - -union skc_raster_cohort_meta_out -{ - skc_uint4 u32v4; - - struct { - skc_uint blocks; // # of blocks in raster -- initially just rk blocks - skc_uint offset; // start of rk span - skc_uint nodes; // # of nodes in raster -- necessary for walking - skc_uint keys; // # of rk & pk keys -- initially just rk - }; -}; - -union skc_raster_cohort_meta_inout -{ - union skc_raster_cohort_meta_in in; - union skc_raster_cohort_meta_out out; -}; - -// -// followed by one word for the offset -// - -struct skc_raster_cohort_meta -{ - union skc_raster_cohort_meta_inout inout[SKC_TTSK_SIZE_COHORT]; - skc_uint reads[SKC_TTSK_SIZE_COHORT]; // starting ring reads -- [0] is raster head -}; - -#define SKC_RASTER_COHORT_META_OFFSET_READS (SKC_OFFSET_OF(struct skc_raster_cohort_meta,reads) / sizeof(skc_uint)) - -// -// COHORT ATOMICS -// - -struct skc_raster_cohort_atomic -{ - // rasterization input - skc_uint cmds; - - // rasterization output - skc_uint keys; - - // block pool base -- idea here is to 
perform one atomic allocation - // skc_uint bp_base; -}; - -#define SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS 0 -#define SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS 1 - -#define SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS_CALC (SKC_OFFSET_OF(struct skc_raster_cohort_atomic,cmds) / sizeof(skc_uint)) -#define SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS_CALC (SKC_OFFSET_OF(struct skc_raster_cohort_atomic,keys) / sizeof(skc_uint)) - -SKC_STATIC_ASSERT(SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS == SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS_CALC); // verify -SKC_STATIC_ASSERT(SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS == SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS_CALC); // verify - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/skc/rasterize.cl b/src/compute/skc/rasterize.cl deleted file mode 100644 index c9462ecff5..0000000000 --- a/src/compute/skc/rasterize.cl +++ /dev/null @@ -1,3367 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block_pool_cl.h" - -#include "atomic_cl.h" -#include "common.h" -#include "tile.h" - -// #define SKC_ARCH_AVX2 -// #define SKC_RASTERIZE_SIMD_USES_SMEM - -#define PRINTF_ENABLE 0 -#define PRINTF_BLOCK_COUNT 0 - -// -// NOTE: -// -// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT -// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE -// -// NOTE: -// -// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP. -// -// - -#if 0 // SKC_ARCH_AVX2 - -// #define SKC_RASTERIZE_SUBGROUP_SIZE 1 -// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3 -// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1 - -// #define SKC_TTXB_WORDS 8 - -// #define SKC_RASTERIZE_FLOAT float8 -// #define SKC_RASTERIZE_UINT uint8 -// #define SKC_RASTERIZE_INT int8 -// #define SKC_RASTERIZE_PREDICATE int8 - -// #define SKC_RASTERIZE_BIN_BLOCK uint16 -// #define SKC_RASTERIZE_BIN uint8 - -// #define SKC_RASTERIZE_POOL uint8 -// #define SKC_RASTERIZE_POOL_SCALE 6 - -// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1 -// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 - -// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8() - -#endif - -// -// SIMT -// - -#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE -#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE -#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1) - -// -// -// - -#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2) -#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE) - -// -// -// - -#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 } -#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 } - -// -// -// - -#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS) -#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS) -#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS) -#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS) -#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT) -#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS) - -// -// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" -// -// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ -// -// Lerp in two fma/mad ops: -// -// t * b + ((-t) * a + 
a) -// -// Note: OpenCL documents mix() as being implemented as: -// -// a + (b - a) * t -// -// But this may be a native instruction on some devices. For example, -// on GEN9 there is an LRP "linear interoplation" opcode but it -// doesn't appear to support half floats. -// -// Feel free to toggle this option and then benchmark and inspect the -// generated code. We really want the double FMA to be generated when -// there isn't support for a LERP/MIX operation. -// - -#if 1 -#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) -#else -#define SKC_LERP(a,b,t) mix(a,b,t) -#endif - -// -// There is no integer MAD in OpenCL with "don't care" overflow -// semantics. -// -// FIXME -- verify if the platform needs explicit MAD operations even -// if a "--fastmath" option is available at compile time. It might -// make sense to explicitly use MAD calls if the platform requires it. -// - -#if 1 -#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c)) -#else -#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c) -#endif - -// -// -// - -#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()) - -// -// -// - -union skc_bp_elem -{ - skc_uint u32; - skc_tagged_block_id_t tag_id; - skc_float coord; -}; - -// -// -// - -struct skc_subgroup_smem -{ - // - // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM ) - struct { - union { - - skc_uint winner; - - struct { - skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; - } aN; - - struct { - SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; - } vN; - }; - } subgroup; -#endif - - // - // work-in-progress TTSB blocks and associated YX keys - // - union { - struct { - // FIXME -- some typedefs are valid here - skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; - skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - } aN; -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - struct { - SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; - SKC_RASTERIZE_BIN yx; - SKC_RASTERIZE_BIN id; - SKC_RASTERIZE_BIN count; - } vN; -#endif - } bin; -}; - -// -// -// - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) -#define skc_subgroup_lane() 0 -#else -#define skc_subgroup_lane() get_sub_group_local_id() -#endif - -// -// replenish block ids -// -// note that you can't overrun the block id pool since it's a ring -// - -static -void -skc_blocks_replenish(skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // - // get a new vector of block ids -- this is kind of a narrow - // allocation but subblocks help stretch out the pool. 
- // - // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids - // - skc_uint bp_idx = 0; - - if (skc_subgroup_lane() == 0) - { - bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS, - SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads -#if 0 - printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE); -#endif - } - - bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask; - *blocks = bp_ids[bp_idx]; - *blocks_next = 0; -} - -// -// -// - -static -skc_block_id_t -skc_blocks_get_next(skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - // replenish? - if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE) - { - skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); - } - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) - // - // SIMT - // - skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); - -#else - // - // SIMD - // - skc_block_id_t id = blocks->s0; - - skc_shuffle_down_1(*blocks); - -#endif - - *blocks_next += 1; - - return id; -} - -// -// subblock allocator -// - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - -static -skc_block_id_t -skc_subblocks_get_next(skc_block_id_t * const subblocks, - skc_uint * const blocks_next, - skc_block_id_v_t * const blocks, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids) -{ - if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); - } - - skc_block_id_t const sb_id = *subblocks; - - *subblocks += 1; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("= %u\n",sb_id); -#endif - - return sb_id; -} - - -#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks -#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks - -#else - -#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks -#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks - -#endif - -// -// -// - -static -skc_block_id_t -skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(), - skc_uint * const blocks_next, - __global SKC_ATOMIC_UINT volatile * const bp_atomics, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const bp_ids, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_ttsk_v_t * const sk_v, - skc_uint * const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - skc_uint const new_yx) -{ -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t const new_id = skc_subblocks_get_next(subblocks, - blocks_next, - blocks, - bp_atomics, - bp_mask, - bp_ids); -#else - skc_block_id_t const new_id = skc_blocks_get_next(blocks_next, - blocks, - bp_atomics, - bp_mask, // pow2 modulo mask for block pool ring - bp_ids); -#endif - - if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK)) - { - sk_v->lo = new_id; - sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx; -#if 0 - printf("@ ( %3u, %3u ) %u\n", - (new_yx >> 12) & 0xFFF, - (new_yx ) & 0xFFF, - new_id); -#endif - } - - *sk_v_next += 1; - - if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE) - { - *sk_v_next = 0; - - skc_uint sk_idx = 0; - - if (skc_subgroup_lane() == 0) - { - sk_idx = 
SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE); -#if 0 - printf("+ %u\n",sk_idx); -#endif - } - - sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE ) - if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE) -#endif - { - sk_extent[sk_idx] = *sk_v; -#if 0 - printf("> %u : %v2u\n",sk_idx,*sk_v); -#endif - } - } - - return new_id; -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. - // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 0 + - // -- - // 01 - SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 012 + - // ---- - // 0123 - // 01 + - // ---- - // 0123 - // - SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 0123456 + - // -------- - // 01234567 - // 012345 + - // -------- - // 01234567 - // 0123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w); - SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 0123456789abcde + - // ---------------- - // 0123456789abcdef - // 0123456789abcd + - // ---------------- - // 0123456789abcdef - // 0123456789ab + - // ---------------- - // 0123456789abcdef - // 01234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); - SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); - SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_add(v); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. 
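//
// The swizzle/mad ladders in these scans are a log-step
// (Hillis-Steele) inclusive scan unrolled for a fixed vector width.
// A scalar sketch of the same idea for a width of 8 (illustrative
// only; the name scan_inclusive_add_8 is not part of this file):
//
static
void
scan_inclusive_add_8(float v[8])
{
  // offsets 1,2,4 -- each pass adds the element `offset` lanes to the
  // left, which is exactly what the shifted swizzle with a 0/1 mask does
  for (int offset=1; offset<8; offset<<=1)
    {
      float t[8];

      for (int ii=0; ii<8; ii++)
        t[ii] = (ii >= offset) ? v[ii] + v[ii-offset] : v[ii];

      for (int ii=0; ii<8; ii++)
        v[ii] = t[ii];
    }
}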
- // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 0 + - // -- - // 01 - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 012 + - // ---- - // 0123 - // 01 + - // ---- - // 0123 - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 0123456 + - // -------- - // 01234567 - // 012345 + - // -------- - // 01234567 - // 0123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w); - SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 0123456789abcde + - // ---------------- - // 0123456789abcdef - // 0123456789abcd + - // ---------------- - // 0123456789abcdef - // 0123456789ab + - // ---------------- - // 0123456789abcdef - // 01234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); - SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); - SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); - SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_add(v); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // Note that there isn't a built-in horizontal scan for vectors so - // we'll define some here for various widths. 
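//
// The max scan below exists to support skc_scatter_scan_max(): each
// lane with work scatters its own index at its exclusive-scan slot
// and the max scan then propagates that source lane to the right.  A
// scalar sketch, assuming at most 64 lanes (the scatter_scan_max_64
// name and fixed bound are illustrative, not part of this file):
//
static
void
scatter_scan_max_64(unsigned int const n,        // number of lanes, <= 64
                    float        const iss[],    // inclusive scan of per-lane work counts
                    float        const ess[],    // exclusive scan of per-lane work counts
                    unsigned int       source[]) // out: source lane per worker slot
{
  unsigned int scratch[64] = { 0 };

  // scatter: lane ii claims the worker slot at its exclusive prefix
  for (unsigned int ii=0; ii<n; ii++)
    if ((iss[ii] > 0.0f) && (ess[ii] < (float)n))
      scratch[(unsigned int)(ess[ii] < 0.0f ? 0.0f : ess[ii])] = ii;

  // inclusive max scan: propagate each claimed lane index rightward
  unsigned int running = 0;

  for (unsigned int ii=0; ii<n; ii++)
    {
      running    = (scratch[ii] > running) ? scratch[ii] : running;
      source[ii] = running;
    }
}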
- // - // FIXME -- a scalar version might be faster so put in a - // compile-time switch to selection between implementations - // - -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - // 01 - // 00 max - // -- - // 01 - SKC_RASTERIZE_UINT const w = max(v.s00,v); - return w; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - // 0123 - // 0012 + - // ---- - // 0123 - // 0101 + - // ---- - // 0123 - // - SKC_RASTERIZE_UINT const w = max(v.s0012,v); - SKC_RASTERIZE_UINT const x = max(w.s0101,w); - return x; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - // 01234567 - // 00123456 + - // -------- - // 01234567 - // 01012345 + - // -------- - // 01234567 - // 01230123 + - // -------- - // 01234567 - // - SKC_RASTERIZE_UINT const w = max(v.s00123456,v); - SKC_RASTERIZE_UINT const x = max(w.s01012345,w); - SKC_RASTERIZE_UINT const y = max(x.s01230123,x); - return y; - -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - // 0123456789abcdef - // 00123456789abcde + - // ---------------- - // 0123456789abcdef - // 010123456789abcd + - // ---------------- - // 0123456789abcdef - // 01230123456789ab + - // ---------------- - // 0123456789abcdef - // 0123456701234567 + - // ---------------- - // 0123456789abcdef - // - SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v); - SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w); - SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x); - SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y); - return z; - -#endif - -#else - // - // SIMT - // - - return sub_group_scan_inclusive_max(v); - -#endif -} - -// -// -// - -static -float -skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return v.s1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return v.s3; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return v.s7; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return v.sf; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_UINT -skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return v.s1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return v.s3; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return v.s7; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return v.sf; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); - -#endif -} - -// -// -// - -static -float -skc_subgroup_first(SKC_RASTERIZE_FLOAT const v) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#else - return v.s0; -#endif - -#else - // - // SIMT - // - return sub_group_broadcast(v,0); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v, - SKC_RASTERIZE_UINT const i) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return v; -#else - return shuffle(v,i); -#endif - -#else - // - // SIMT - // - return intel_sub_group_shuffle(v,i); - -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // 
previous - SKC_RASTERIZE_FLOAT const c) // current -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - // FIXME -- there are alternative formulations here: - // - // Option 1: - // - // select(c.rotate(+1),p.rotate(-1),(1,0,0,...)) - // - // Option 2: - // - // p is a scalar - // t = c.rotate(+1) - // t.s0 = p; - // - // Option 3: ... - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return p; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return shuffle2(p,c,(uint2)(1,2)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return shuffle2(p,c,(uint4)(3,4,5,6)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14)); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)); -#endif - -#else - // - // SIMT - // - return intel_sub_group_shuffle_up(p,c,1); - -#endif -} - -// -// -// - -static -bool -skc_is_lane_first() -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) - // - // SIMD - // - return true; -#else - // - // SIMT - // - return get_sub_group_local_id() == 0; -#endif -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_delta_offset() -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - return 1; -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 ); -#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) - return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ); -#endif - -#else - // - // SIMT - // - return 1.0f + get_sub_group_local_id(); - -#endif - -} - -// -// -// - -static -int -skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p) -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - return any(p); -#else - // - // SIMT - // - return sub_group_any(p); -#endif -} - -// -// -// - -#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK) - -void -skc_segment_next(__global union skc_bp_elem * const bp_elems, - skc_uint * const nodeword, - skc_block_id_t * const id) -{ - if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - { - if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword)) - { - *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS; - } - - skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id; - - *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - } -} - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y) -{ - return native_sqrt(x * x + y * y); -} - -// -// Wang's Formula (1985) -// - -#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned - -#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32) - -#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON)) -#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON)) - -#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y) -#define SKC_WANG_SQRT(x) native_sqrt(x) - -// -// -// - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, - SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, - SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y, - SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y) -{ - // - // Return 
the number of evenly spaced (in the parametric sense) line - // segments that are guaranteed to be within "epsilon" error of the - // curve. - // - // We're then going to take multiples of the reciprocal of this - // number so that the segmentation can be distributed across the - // subgroup. - // - // Note, this can probably be slightly optimized per architecture - // but it's probably far from being a hotspot since it's all - // straight-line unpredicated code. - // - // The result is an integer ranging from [1.0,#segments] - // - // Note that even if all of the control points are coincident, the - // max(1.0f) will categorize this as a line of 1 segment. - // - // This is what we want! We want to convert cubics to lines as - // easily as possible and *then* cull lines that are either - // horizontal or zero length. - // - return max(1.0f, - ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC * - SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x), - fabs(t3x - 2.0f * t2x + t1x)), - max(fabs(t2y - 2.0f * t1y + t0y), - fabs(t3y - 2.0f * t2y + t1y)))))); -} - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, - SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, - SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y) -{ - return max(1.0f, - ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * - SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x), - fabs(t2y - 2.0f * t1y + t0y))))); -} - -// -// rational curves -// - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_cubic_rat() -{ - return 0.0f; -} - -static -SKC_RASTERIZE_FLOAT -skc_wangs_formula_quad_rat() -{ - return 0.0f; -} - -// -// flush any work-in-progress blocks and return unused block ids -// - -static -void -skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_block_id_v_t * const blocks, - skc_uint const blocks_next, - skc_ttsk_v_t * const sk_v, - skc_uint const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - __local struct skc_subgroup_smem volatile * const smem) -{ - // - // flush non-empty bins - // - // FIXME -- accelerate this iteration/search with a subgroup operation - // - for (skc_uint ii=0; iibin.aN.count[ii] > 0) - { - skc_block_id_v_t const id = smem->bin.aN.id[ii]; - skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - skc_uint const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()]; -#if 0 - printf("???????? 
: [ %10u = %10u : %08X ]\n",id,idx,tts); -#endif - bp_elems[idx].u32 = tts; - } - - // - // FIXME -- vectorize with vstoreN() - // - } - - // - // return remaining block ids back to the pool - // - skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next; - - if (blocks_rem > 0) - { - skc_uint bp_idx = 0; - - if (skc_subgroup_lane() == 0) - { - bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem); - -#if 0 - printf("r-: %8u + %u\n",bp_idx,blocks_rem); -#endif - } - - bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask; - - if (skc_subgroup_lane() >= blocks_next) - { - bp_ids[bp_idx] = *blocks; - } - } - - // - // flush work-in-progress ryx keys - // - if (sk_v_next > 0) - { - skc_uint sk_idx = 0; - - if (skc_subgroup_lane() == 0) - { - sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next); -#if 0 - printf("* %u\n",sk_idx); -#endif - } - - sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); - - if (skc_subgroup_lane() < sk_v_next) - { - sk_extent[sk_idx] = *sk_v; - } - } -} - -// -// If there are lanes that were unable to append to a bin because -// their hashes collided with a bin's current ryx key then those bins -// must be ejected. -// -// Note that we do not eject "full" bins because lazily waiting for a -// collision results in simpler code. -// - -static -void -skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_block_id_t * const subblocks, - skc_block_id_v_t * const blocks, - skc_uint * const blocks_next, - skc_ttsk_v_t * const sk_v, - skc_uint * const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - __local struct skc_subgroup_smem volatile * const smem, - SKC_RASTERIZE_UINT const hash, - SKC_RASTERIZE_UINT const yx, - SKC_RASTERIZE_PREDICATE is_collision) // pass by value -{ -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - - // - // FIXME -- this code is now stale with the changes to the - // subblock/block allocation strategy - // - - // - // get local TTSB ID queue count - // - skc_uint ttsb_id_count = smem->pool.count; // scalar - - // init hash bit mask - skc_uint component_mask = 0; - - for (int cc=0; ccbin.aN.count[winner] > 0) - { - skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - - bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; - } - - // - // ensure there is at least one TTSK and TTSB ID - // - if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE) - { - // - // update remaining count - // - ttsb_id_count = 0; - - // - // flush accumulated ttsk_ryx keys - // - uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE - (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count - -#if 0 - printf("# %u\n",idx); -#endif - - for (uint ii=0; iipool.aN.id[ii] = bp_ids[id + ii]; - } - - // - // invalidate the winning block - // - - // - // update bin with winning yx, new ttsb id and zero count - // - // all lanes are loading/storing from/to the same index - // - smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID ); - smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count]; - smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc]; - smem->bin.aN.count[winner] = 0; - - // - // update count - // 
- ttsb_id_count += 1; - } - - // - // save count - // - smem->pool.count = ttsb_id_count; - -#else - // - // SIMT - // - - do { - // - // only one lane will win! - // - if (is_collision) - smem->subgroup.winner = hash; - - barrier(CLK_LOCAL_MEM_FENCE); - - // - // which bin is being ejected? - // - skc_uint const winner = smem->subgroup.winner; - - // - // which colliding hash is taking over the bin? - // - SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner); - - // - // all lanes with the same hash will try to store but only one - // lane will win - // - if (is_winner) - smem->subgroup.winner = yx; - - barrier(CLK_LOCAL_MEM_FENCE); - - // - // flush this block to the pool - // - if (smem->bin.aN.count[winner] > 0) - { - skc_block_id_v_t const id = smem->bin.aN.id[winner]; - skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; -#if 0 - printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts); -#endif - bp_elems[idx].u32 = tts; - } - - // - // append new ttsk - // - skc_uint const new_yx = smem->subgroup.winner; - skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(), - blocks_next, - bp_atomics, - bp_mask, // pow2 modulo mask for block pool ring - bp_ids, - cohort_atomics, - sk_v, - sk_v_next, - sk_extent, - new_yx); - -#if 0 - if (get_sub_group_local_id() == 0) { - printf(">>> %9u\n",new_id); - } -#endif - - // - // update bin with winning yx, new ttsb id and zero count - // - smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID; - smem->bin.aN.yx [winner] = new_yx; - smem->bin.aN.id [winner] = new_id; - smem->bin.aN.count[winner] = 0; - - // - // remove all lanes matching this hash - // - is_collision = is_collision && !is_winner; - - // - // exit if nothing left to do - // - } while (sub_group_any(is_collision)); - -#endif -} - -// -// scatter scan max -// -static -SKC_RASTERIZE_UINT -skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem, - SKC_RASTERIZE_FLOAT const iss, - SKC_RASTERIZE_FLOAT const ess) -{ - // - // prefix sums determine which lanes we're going to work on next - // - SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP); - SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f)); - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // -#ifdef SKC_RASTERIZE_SIMD_USES_SMEM - // - // SIMD APPROACH 1: SIMT'ISH - // - - // zero the volatile smem scratchpad using vector syntax - smem->subgroup.vN.scratch[0] = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (is_scratch_store C) \ - smem->subgroup.aN.scratch[scratch_idx C] = I; - - SKC_RASTERIZE_VECTOR_EXPAND(); - - // propagate lanes to right using max scan - SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0]; - SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); - -#else - // - // SIMD APPROACH 2: SCALAR'ISH - // - - SKC_RASTERIZE_UINT source = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (is_scratch_store C) \ - ((uint *)&source)[scratch_idx C] = I; - - SKC_RASTERIZE_VECTOR_EXPAND(); - - for (uint ii=1; iisubgroup.vN.scratch[skc_subgroup_lane()] = ( 0 ); - - // - // store source lane at starting lane - // - if (is_scratch_store) - smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane(); - - // - // propagate lanes to right using max scan - // - SKC_RASTERIZE_UINT const 
scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()]; - SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); -#endif - - return source; -} - -// -// sliver lines into subpixels -// - -static -void -skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - skc_block_id_t * const subblocks, - skc_block_id_v_t * const blocks, - skc_uint * const blocks_next, - skc_ttsk_v_t * const sk_v, - skc_uint * const sk_v_next, - __global skc_ttsk_s_t * const sk_extent, - __local struct skc_subgroup_smem volatile * const smem, - SKC_RASTERIZE_FLOAT const l0x, - SKC_RASTERIZE_FLOAT const l0y, - SKC_RASTERIZE_FLOAT const l1x, - SKC_RASTERIZE_FLOAT const l1y) -{ - // - // Y-SLIVERING - // ----------- - // - // immediately sliver all multi-pixel lines in into 1-pixel high - // lines - // - // note this implicitly squelches horizontal lines - // - // there is another test for horizontal lines after x-slivering - // is complete - // - - // - // will we need to flip the sign of y_delta ? - // - SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y); - SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000; - - // - // save 1/dy - // - SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y); - - // - // how many non-horizontal subpixel y-axis slivers are there? - // - SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); - SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); - SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max; - SKC_RASTERIZE_FLOAT y_segs = y_max - y_min; - - // - // inclusive subgroup scan of y_segs - // - SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs); - SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs; - float y_rem = skc_subgroup_last_float(y_iss); - - // - // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails - // - if (y_segs == 0.0f) - y_iss = 0.0f; - -#if 0 - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem); -#endif - - // - // these values don't matter on first iteration - // - SKC_RASTERIZE_FLOAT n1x_prev = 0; - SKC_RASTERIZE_FLOAT n1y_prev = 0; - - // - // loop until done - // - while (y_rem > 0.0f) - { - // - // distribute work across lanes - // - SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess); - - // - // get line at y_source line - // - SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source); - SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source); - SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source); - SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source); - - // - // every lane will create a 1 pixel tall line "sliver" - // - // FIXME -- this gets expanded on SIMD - // - // if numerator == 1 then this is the first lane - // if numerator == s then this is the last lane - // - SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source); - SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source); - - SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f); - SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count); - - // toggle y_delta sign - SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source))); 
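//
// The sign toggle above relies on IEEE-754 keeping the float sign in
// the MSB: XOR with 0x80000000 negates, XOR with 0 is a no-op, so the
// branch collapses into one bitwise op.  A host-side sketch
// (toggle_sign is an illustrative name, not part of this file):
//

#include <stdint.h>
#include <string.h>

static
float
toggle_sign(float const f, uint32_t const sign_xor) // sign_xor is 0 or 0x80000000
{
  uint32_t u;
  float    r;

  memcpy(&u,&f,sizeof(u)); // as_uint(f)

  u ^= sign_xor;

  memcpy(&r,&u,sizeof(r)); // as_float(u)

  return r;                // toggle_sign(2.0f,0x80000000u) == -2.0f
}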
- - // - // calculate "right" line segment endpoint - // - SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP; - SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source); - SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t)); - - // - // override c1 if this is last point - // - n1y = select(n1y,m1y,is_y_last); - n1x = select(n1x,m1x,is_y_last); - - // - // shuffle up "left" line segment endpoint - // - // NOTE: Intel's shuffle_up is unique with its elegant - // "previous" argument so don't get used to it - // - SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y); - SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x); - - // - // override shuffle up if this is the first line segment - // - n0y = select(n0y,m0y,is_y_first); - n0x = select(n0x,m0x,is_y_first); - - // - // save previous right endpoint - // - n1x_prev = n1x; - n1y_prev = n1y; - - // - // decrement by subgroup size - // - y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - -#if 0 - // - // debug - // - if (n0y != n1y) { - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y); - } -#endif - - // - // X-SLIVERING - // ----------- - // - // now sliver 1-pixel high lines into at either vertical or - // 1-pixel wide lines - // - // save original direction and work with increasing x - // - SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x); - SKC_RASTERIZE_UINT const dx_xor = x_lt ? 0 : 0x80000000; - - // - // save 1/dy - // - SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x); - - // - // how many non-horizontal subpixel y-axis slivers are there? - // - SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); - SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); - SKC_RASTERIZE_FLOAT const x_base = x_lt ? 
x_min : x_max; - SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f); - - // - // inclusive subgroup scan of y_segs - // - SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs); - SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs; - float x_rem = skc_subgroup_last_float(x_iss); - - // - // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails - // - //if (x_segs == 0.0f) - // x_iss = 0.0f; - - // - // these values don't matter on first iteration - // - SKC_RASTERIZE_FLOAT p1x_prev = 0; - SKC_RASTERIZE_FLOAT p1y_prev = 0; - - // - // loop until done - // - while (x_rem > 0) - { - // - // distribute work across lanes - // - SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess); - - // - // get line at y_source line - // - SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source); - SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source); - SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source); - SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source); - - // - // every lane will create a 1 pixel tall line "sliver" - // - // FIXME -- this gets expanded on SIMD - // - // if numerator == 1 then this is the first lane - // if numerator == s then this is the last lane - // - SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source); - SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source); - - SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f); - SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count); - - // toggle x_delta sign - SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source))); - - // - // calculate "right" line segment endpoint - // - SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP; - SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source); - SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t)); - - // - // override c1 if this is last point - // - p1x = select(p1x,o1x,is_x_last); - p1y = select(p1y,o1y,is_x_last); - - // - // shuffle up "left" line segment endpoint - // - // NOTE: Intel's shuffle_up is unique with its elegant - // "previous" argument so don't get used to it - // - SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x); - SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y); - - // - // override shuffle up if this is the first line segment - // - p0x = select(p0x,o0x,is_x_first); - p0y = select(p0y,o0y,is_x_first); - - // - // save previous right endpoint - // - p1x_prev = p1x; - p1y_prev = p1y; - - // - // decrement by subgroup size - // - x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - - // - // only non-horizontal subpixel lines are valid - // - SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y); - - // - // if no lanes are active then continue - // - // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY - // IMPACTS PERFORMANCE (+12% ?) - // - // IT SHOULDN'T !!! - // -#if 0 - if (!skc_subgroup_any(is_active)) - continue; -#endif - - // - // Option 1: use SLM for explicitly managed coalesced stores - // - // 1. which tile does this line belong? - // 2. hash tile coordinates - // 3. lookup hash - // 4. if tile matches then SLM append keys - // 5. if tile doesn't match - // a. flush - // b. create new TTSK_RYX - // c. 
obtain TTSB block from pool - // d. goto 3. - // - - // - // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores - // - // 1. which tile does this line belong? - // 2. hash tile coordinates - // 3. lookup hash - // 4. if tile matches then GMEM append keys - // 5. if tile doesn't match - // a. flush (and invalidate empty elems) - // b. create new TTSK_RYX - // c. obtain TTSB block from pool - // d. goto 3. - // - - // - // The virtual rasterization surface is very large and - // signed: +/- ~64K-256K, depending on the architecture. - // - // Rasters must be clipped to the virtual surface and, - // optionally, clipped even further on a per raster - // basis. - // - - // - // Clip to the per-raster clip - // - - /* - - CLIP HERE - - */ - - // - // Hash the tile coordinates - // - // This table lists nominal values for each architecture. - // We want to choose values that are naturally fit the - // "width" of the architecture. - // - // SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS - // ---- ------- ---- --------- -------- --------- - // 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ? - // 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX* - // 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,? - // 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN* - // 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon - // - // NOTE: When possible, bias the hash toward using more y - // bits because of: - // - // 1. the 90 degree counter-clockwise rotation that we put - // in place to offset the render-time clockwise - // rotation - // - // 2. the likely presence of left-to-right or - // right-to-left glyphs. - // - // For power-of-two bins, the hash is easy. - // - // For non-power-of-two, we may want to either implement a - // fast mod (compiler should do this for us... hahahaha) or - // drop down to the next power-of-two. 
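//
// For a power-of-two bin count, the hash described above reduces to
// concatenating the low tile-coordinate bits, biased toward y, which
// is what the kernel computes just below.  A sketch with assumed
// widths (X_BITS=1, Y_BITS=2 -> 8 bins); the real values come from
// the per-device config, not from this example:
//

#define TILE_HASH_X_BITS 1
#define TILE_HASH_Y_BITS 2
#define TILE_HASH_X_MASK ((1u << TILE_HASH_X_BITS) - 1)
#define TILE_HASH_Y_MASK ((1u << TILE_HASH_Y_BITS) - 1)

static
unsigned int
tile_hash(unsigned int const tile_x, unsigned int const tile_y)
{
  // a non-pow2 bin count would use a (fast) mod here instead
  return ((tile_y & TILE_HASH_Y_MASK) << TILE_HASH_X_BITS) |
         ((tile_x & TILE_HASH_X_MASK));
}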
- // - - // - // FIXME -- this snarl is not good -- can probably reduce - // some of the sign casting but some is there to vectorize a - // scalar - // - SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y); - SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y); - - SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x); - SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x); - - SKC_RASTERIZE_INT const min_y = min(z0y,z1y); - SKC_RASTERIZE_INT const max_y = max(z0y,z1y); - - SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2; - - SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y; - SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y); - - // - // map [+1,+32] to [ 0,+31] - // map [-1,-32] to [-1,-32] - // - SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26; - - SKC_RASTERIZE_INT const min_x = min(z0x,z1x); - SKC_RASTERIZE_INT const max_x = max(z0x,z1x); - SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2; - - SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X; - SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x); - - SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx; - - SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) | - (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK)); - - SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF)); - -#if 0 - printf("(%3u, %3u)\n",tile_y,tile_x); -#endif - -#if 0 - if (is_active) - printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx); -#endif - - // - // debug - // -#if 0 // PRINTF_ENABLE - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (is_active C) \ - printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C); - - SKC_RASTERIZE_VECTOR_EXPAND(); -#else - if (is_active) - printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash); -#endif - -#endif - // - // flush all active lanes - // - while (true) - { - // - // either gather load or vector load+shuffle the yx keys - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx; - SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash); -#else - SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash]; -#endif - - // - // does yx for lane match yx for hash? - // - SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID; - SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx); - - // - // OpenCL spec: "When casting a bool to a vector integer - // data type, the vector components will be set to -1 - // (i.e. all bits set) if the vector bool value is true - // and 0 otherwise. - // -#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) - SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match; -#else - SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0} -#endif - // - // how many new elements for each matching hash bin? 
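//
// The next few lines answer that question for every bin at once by
// packing one small counter per bin into a single uint, so a single
// subgroup add-scan ranks all bins simultaneously.  A scalar sketch
// (BIN_BITS/BIN_MASK and packed_bin_ranks are illustrative stand-ins
// for the SKC_RASTERIZE_TILE_HASH_BIN_* values):
//

#define BIN_BITS 4
#define BIN_MASK ((1u << BIN_BITS) - 1)

static
void
packed_bin_ranks(unsigned int const n,
                 unsigned int const hash[],  // per-lane bin index
                 unsigned int const match[], // per-lane 0/1 "appends to its bin"
                 unsigned int       rank[])  // out: 1-based rank within the lane's bin
{
  unsigned int iss = 0; // running inclusive scan of the packed counters

  for (unsigned int ii=0; ii<n; ii++)
    {
      iss      += match[ii] << (hash[ii] * BIN_BITS);
      rank[ii]  = (iss >> (hash[ii] * BIN_BITS)) & BIN_MASK;
    }
}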
- // - SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS; - SKC_RASTERIZE_UINT const h = h_match << h_shl; - - // - // prefix sum all of the bins in parallel - // - SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h); - SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss); - - // - // current bin counts - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count; - SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash); -#else - SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash]; -#endif - - // - // calculate where each cache-hit and in-bounds tts should be stored - // - SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1; - SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur; - - // - // which lanes can append to a matching bin? - // - SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS); - - // - // scatter append tts elements to bin blocks - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) - // - // SIMD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (is_append C) \ - { \ - smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \ - smem->bin.aN.count[hash C] = count_new C; \ - } - - SKC_RASTERIZE_VECTOR_EXPAND(); -#else - // - // SIMT - // - if (is_append) - { - smem->bin.aN.ttsb [hash][ttsb_index] = tts; - smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS - } -#endif - // - // try to keep predicate updates SIMD-friendly and - // outside of predicated code paths -- this is not - // always how we would normally do things on SIMT but - // either approach is acceptable - // - - // - // mask off lanes/components that successfully appended - // - is_active = is_active && !is_append; - - // - // are there any active lanes left? - // - if (!skc_subgroup_any(is_active)) - break; - - // - // There are active lanes that couldn't be appended to a - // bin because their hashes collided with the bin's - // current ryx key then those bins must be ejected. - // - // Note that we do not eject "full" bins because lazily - // waiting for a collision results in simpler code. - // - skc_flush(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - subblocks, - blocks, - blocks_next, - sk_v, - sk_v_next, - sk_extent, - smem, - hash, - yx, - is_active); - } - } - } -} - -// -// INITIALIZE SMEM -// -// Note that SIMD/SIMT have nearly the same syntax. 
-// -static -void -skc_smem_init(__local struct skc_subgroup_smem volatile * const smem) -{ - // - // initialize smem bins - // -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT ); - smem->bin.vN.count = ( 0 ); -#else - // - // SIMT - // - int idx = skc_subgroup_lane(); - -#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) - if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT) -#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) - for (; idxbin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT ); - smem->bin.aN.count[idx] = ( 0 ); - } -#endif -} - -// -// RASTERIZE CUBIC KERNEL -// - -static -void -skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __local struct skc_subgroup_smem volatile * const smem, - - skc_uint * const nodeword, - skc_block_id_t * const id, - - union skc_transform const * const tv, - union skc_path_clip const * const cv, - skc_uint const cohort) -{ - // - // the initial segment idx and segments-per-block constant determine - // how many block ids will need to be loaded - // - SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - // - // apply transform - // - // note that we only care if the end points are rounded to subpixel precision - // - // FIXME -- transformation is currently affine-only support perspective later - // - // the affine transformation requires 8 FMA + 2 ROUND operations - // - SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); - - SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; - SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; - - SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx + c2y * tv->shx + tv->tx; - SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy + tv->ty; - - SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx + c3y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy + tv->ty); - - // - // - // -#if PRINTF_ENABLE - -#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \ - " { %.02f, %.02f }, { %.02f, %.02f } },\n", \ - b0x C,b0y C,t1x C,t1y C, \ - t2x C,t2y C,t3x C,t3y C); - - SKC_RASTERIZE_VECTOR_EXPAND(); - -#else - - printf("{ { %.02f, %.02f }, { 
%.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n", - b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); - -#endif - -#endif - - // - // OLD APPROACH - // ------------ - // - // The Spinel CUDA rasterizer was significantly more complex and - // performed a few different tasks that are probably best kept - // separate. - // - // The Spinel rasterizer Bezier held 4-element x and y coordinates - // in adjacent lanes. This simplified intermingling of single lane - // 4-coordinate line segments with two-lane cubic Beziers. - // - // After transformation of the input segments, the Spinel rasterizer - // would test cubics for flatness and, if flat, collapse the - // adjacent lanes into a single line lane and an empty lane. - // - // Any lines would then be appended to a line queue. - // - // Any cubics would then be subdivided. - // - // The reclassification process would be repeated. - // - // NEW APPROACH - // ------------ - // - // Assume we're only working with cubics in this kernel. - // - // Optimization: if the line segment is a special case -- a cusp, - // has 1+ inflections, or a loop -- it might be beneficial to - // subdivide the control cage 1+ times in order to separate the - // flatter segments the high-velocity region(s). - // - // This means we want to split using [a,b] formulation to _directly_ - // subdivide producing a new control cage. - // - // Wang's Formula is still useful even if we subdivide once or twice - // as it's so cheap that it might give some useful hints about where - // the high-velocity sections of curve reside. - // - // But it seems like using Wang's and directly flattening to line - // segments without any subdivision is good enough for the limited - // set of test cases that I've tried. - // - // So... use Wang's Formula to estimate how many line segment are - // required to properly flatten the cubics. - // - // Then use inclusive/exclusive scans to put all the lanes to work: - // - // 1. segmenting cubics to line segments - // - // 2. slivering line segments into 1-pixel high line segments - // - // 3. slivering 1-pixel high line segments into 1-pixel wide line - // segments - // - // MORE BACKGROUND ON NEW APPROACH - // ------------------------------- - // - // Two options for handling line segments: - // - // 1. append the line segments onto an SLM array until enough - // work has been accrued (Spinel does this) - // - // 2. immediately sliver the potentially multi-pixel line - // segments into subpixel lines - // - // The advantage of (1) is that it guarantees the slivering - // process will, on average, always be emitting a full subgroup - // of subpixel lines. - // - // The advantage of (2) is that it reduces code complexity and - // leaves more room for SLM tile bins. The difference between Spinel - // and Skia Compute is that Wang's Formula guarantees there will be - // a full subgroup of multi-pixel lines unless this is the final - // iteration of the warp of multi-pixel lines. - // - // Note that wider GPU architectures might benefit from (1) and - // other work accumulation strategies because it will minimize - // partial warp workloads in the final iteration of each stage. It - // also minimizes the sunk cost of the uniform control logic steps. - // - // So let's implement (2) for now... - // - - // - // And... begin! - // - // Estimate how many line segments are in quad/cubic curve. - // - // Wang's Formula will return zero if the control points are - // collinear but we bump it up to 1.0f. 
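//
// The same Wang's bound, spelled out for one scalar cubic in plain C
// (wang_cubic_segments is an illustrative name; epsilon plays the
// role of SKC_WANG_EPSILON and is in the same units as the control
// points):
//

#include <math.h>

static
double
wang_cubic_segments(double const x0, double const y0,
                    double const x1, double const y1,
                    double const x2, double const y2,
                    double const x3, double const y3,
                    double const epsilon)
{
  // max magnitude of the two second differences of the control cage
  double const ddx = fmax(fabs(x2 - 2.0*x1 + x0), fabs(x3 - 2.0*x2 + x1));
  double const ddy = fmax(fabs(y2 - 2.0*y1 + y0), fabs(y3 - 2.0*y2 + y1));
  double const dd  = sqrt(ddx*ddx + ddy*ddy);

  // number of evenly spaced parametric segments within epsilon of the
  // curve -- collinear (or coincident) control points yield 1
  return fmax(1.0, ceil(sqrt((3.0 * 2.0) / (8.0 * epsilon) * dd)));
}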
- // - SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); - - // - // if there are free registers then precalculate the reciprocal for - // each estimated segments since it will never change - // - SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); - - - // - // inclusive add scan of estimated line segments - // exclusive add scan of estimated line segments - // total number of estimated line segments - // - SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); - SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; - float s_rem = skc_subgroup_last_float(s_iss); // scalar - - // - // Precompute cubic polynomial coefficients from transformed control - // cage so we can shuffle them in on each iteration of the outer - // loop and then evaluate the polynomial in Horner form. - // - // | 1 0 0 0 | | c0 | - // | | | | - // | -3 3 0 0 | | c1 | - // B(t) = [ 1 t^1 t^2 t^3 ] | | | | - // | 3 -6 3 0 | | c2 | - // | | | | - // | -1 3 -3 1 | | c3 | - // - // - SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL - SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL - - SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL - SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL - - SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB - SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB - - // - // these values don't matter on the first iteration - // - SKC_RASTERIZE_FLOAT l1x_prev = 0; - SKC_RASTERIZE_FLOAT l1y_prev = 0; - - // - // allocate and init in-register TTSK keys - // - skc_uint sk_v_next = 0; - skc_ttsk_v_t sk_v; - - sk_v.hi = cohort; - - // - // initialize smem - // - skc_smem_init(smem); - - // - // initialize blocks / subblocks - // - skc_block_id_v_t blocks; - skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t subblocks = 0; -#endif - - // - // loop until done - // - while (s_rem > 0) - { - // - // distribute work across lanes - // - SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); - - // - // every lane has a fraction to work off of - // - // FIXME -- this gets expanded on SIMD - // - // if delta == 1 then this is the first lane - // if count == s_segs then this is the last lane - // - SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); - SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); - - SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); - SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); - - // - // init parametric t - // - SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? - - // - // if last then override to a hard 1.0f - // - s_t = is_s_last ? 
1.0f : s_t; - - // - // decrement by subgroup size - // - s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - - // - // now every lane knows what to do and the following lines will - // pump out up to SUBGROUP_SIZE line segments - // - // obtain the src vertices through shared or via a shuffle - // - - // - // shuffle in the polynomial coefficients their source lane - // - SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); - SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); - - SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); - SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); - - SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); - SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); - - SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source); - SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source); - - // - // calculate "right" line segment endpoint using Horner form - // - SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND - SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND - - // - // shuffle up "left" line segment endpoint - // - // NOTE: Intel's shuffle_up is unique with its elegant - // "previous" argument so don't get used to it - // - SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); - SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); - - // - // save previous right endpoint - // - l1x_prev = l1x; - l1y_prev = l1y; - - // - // override shuffle up if this is the first line segment - // - l0x = select(l0x,s0x,is_s_first); - l0y = select(l0y,s0y,is_s_first); - - // - // sliver lines - // - skc_sliver(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &subblocks, - &blocks, - &blocks_next, - &sk_v, - &sk_v_next, - sk_extent, - smem, - l0x,l0y,l1x,l1y); - } - - // - // - flush work-in-progress blocks - // - return unused block ids - // - skc_finalize(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &blocks, - blocks_next, - &sk_v, - sk_v_next, - sk_extent, - smem); -} - -// -// RASTERIZE QUAD KERNEL -// - -static -void -skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __local struct skc_subgroup_smem volatile * const smem, - - skc_uint * const nodeword, - skc_block_id_t * const id, - - union skc_transform const * const tv, - union skc_path_clip const * const cv, - skc_uint const cohort) -{ - // - // the initial segment idx and segments-per-block constant determine - // how many block ids will need to be loaded - // - SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - 
skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - // - // apply transform - // - // note that we only care if the end points are rounded to subpixel precision - // - // FIXME -- transformation is currently affine-only support perspective later - // - // the affine transformation requires 8 FMA + 2 ROUND operations - // - SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); - - SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; - SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; - - SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx + c2y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy + tv->ty); - - // - // Estimate how many line segments are in quad/cubic curve. - // - // Wang's Formula will return zero if the control points are - // collinear but we bump it up to 1.0f. - // - SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y); - - // - // if there are free registers then precalculate the reciprocal for - // each estimated segments since it will never change - // - SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); - - - // - // inclusive add scan of estimated line segments - // exclusive add scan of estimated line segments - // total number of estimated line segments - // - SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); - SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; - float s_rem = skc_subgroup_last_float(s_iss); // scalar - - // - // Precompute quadratic polynomial coefficients from control cage so - // we can shuffle them in on each iteration of the outer loop and - // then evaluate the polynomial in Horner form. 
- // - - // | 1 0 0 | | c0 | - // | | | | - // B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 | - // | | | | - // | 1 -2 1 | | c2 | - // - // - SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL - SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL - - SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD - SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD - - // - // these values don't matter on the first iteration - // - SKC_RASTERIZE_FLOAT l1x_prev = 0; - SKC_RASTERIZE_FLOAT l1y_prev = 0; - - // - // allocate and init in-register TTSK keys - // - skc_uint sk_v_next = 0; - skc_ttsk_v_t sk_v; - - sk_v.hi = cohort; - - // - // initialize smem - // - skc_smem_init(smem); - - // - // initialize blocks / subblocks - // - skc_block_id_v_t blocks; - skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t subblocks = 0; -#endif - - // - // loop until done - // - while (s_rem > 0) - { - // - // distribute work across lanes - // - SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); - - // - // every lane has a fraction to work off of - // - // FIXME -- this gets expanded on SIMD - // - // if delta == 1 then this is the first lane - // if count == s_segs then this is the last lane - // - SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); - SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); - - SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); - SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); - - // - // init parametric t - // - SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? - - // - // if last then override to a hard 1.0f - // - s_t = is_s_last ? 
1.0f : s_t; - - // - // decrement by subgroup size - // - s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; - - // - // now every lane knows what to do and the following lines will - // pump out up to SUBGROUP_SIZE line segments - // - // obtain the src vertices through shared or via a shuffle - // - - // - // shuffle in the polynomial coefficients their source lane - // - SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); - SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); - - SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); - SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); - - SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); - SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); - - // - // calculate "right" line segment endpoint using Horner form - // - SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND - SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND - - // - // shuffle up "left" line segment endpoint - // - // NOTE: Intel's shuffle_up is unique with its elegant - // "previous" argument so don't get used to it - // - SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); - SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); - - // - // save previous right endpoint - // - l1x_prev = l1x; - l1y_prev = l1y; - - // - // override shuffle up if this is the first line segment - // - l0x = select(l0x,s0x,is_s_first); - l0y = select(l0y,s0y,is_s_first); - - // - // sliver lines - // - skc_sliver(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &subblocks, - &blocks, - &blocks_next, - &sk_v, - &sk_v_next, - sk_extent, - smem, - l0x,l0y,l1x,l1y); - } - - // - // - flush work-in-progress blocks - // - return unused block ids - // - skc_finalize(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &blocks, - blocks_next, - &sk_v, - sk_v_next, - sk_extent, - smem); -} - -// -// RASTERIZE LINE KERNEL -// - -static -void -skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __local struct skc_subgroup_smem volatile * const smem, - - skc_uint * const nodeword, - skc_block_id_t * const id, - - union skc_transform const * const tv, - union skc_path_clip const * const cv, - skc_uint const cohort) -{ - // - // the initial segment idx and segments-per-block constant determine - // how many block ids will need to be loaded - // - SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - - skc_segment_next(bp_elems,nodeword,id); - - SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; - -#if 0 - // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y); - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y); -#endif - - // - // apply transform - // - // note that we only care if the end points are rounded to subpixel precision - 
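For reference, the conversion matrix shown earlier and the 2-MAD Horner evaluation in the loop above amount to the following arithmetic; the control-point values are made up, and B(0) and B(1) land back on c0 and c2 as expected:

#include <math.h>
#include <stdio.h>

int main(void)
{
  /* made-up (already transformed) quadratic control points for one axis */
  float const c0 = 10.0f, c1 = 40.0f, c2 = 20.0f;

  /* power-basis coefficients from the conversion matrix */
  float const b0 = c0;
  float const b1 = fmaf(-2.0f, c0, 2.0f * c1);   /* -2*c0 + 2*c1    */
  float const b2 = fmaf(-2.0f, c1, c0 + c2);     /*  c0 - 2*c1 + c2 */

  /* Horner: B(t) = b0 + t*(b1 + t*b2) -- the 2 MADs in the loop above */
  for (int i = 0; i <= 4; i++)
    {
      float const t = 0.25f * (float)i;
      printf("B(%.2f) = %g\n", t, fmaf(fmaf(b2, t, b1), t, b0));
    }

  return 0;   /* B(0.00) = 10 (== c0) ... B(1.00) = 20 (== c2) */
}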
// - // FIXME -- transformation is currently affine-only - // FIXME -- support perspective later - // - // the affine transformation requires 8 FMA + 4 ROUND operations - // - SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); - - SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx + c1y * tv->shx + tv->tx); - SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy + tv->ty); - -#if 0 - printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y); -#endif - - // - // allocate and init in-register TTSK keys - // - skc_uint sk_v_next = 0; - skc_ttsk_v_t sk_v; - - sk_v.hi = cohort; - - // - // initialize smem - // - skc_smem_init(smem); - - // - // initialize blocks / subblocks - // - skc_block_id_v_t blocks; - skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; - -#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 - skc_block_id_t subblocks = 0; -#endif - - // - // sliver lines - // - skc_sliver(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &subblocks, - &blocks, - &blocks_next, - &sk_v, - &sk_v_next, - sk_extent, - smem, - l0x,l0y,l1x,l1y); - - // - // - flush work-in-progress blocks - // - return unused block ids - // - skc_finalize(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - &blocks, - blocks_next, - &sk_v, - sk_v_next, - sk_extent, - smem); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. 
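The endpoint transform above is the same two-FMA-plus-round pattern per coordinate that the quad and cubic paths use. A minimal C sketch, assuming an identity transform pre-scaled to a 32x32 subpixel grid:

#include <math.h>
#include <stdio.h>

/* hypothetical layout mirroring { sx shx tx shy sy ty } */
struct xform { float sx, shx, tx, shy, sy, ty; };

static void transform_round(struct xform const * const t,
                            float const x, float const y,
                            float * const ox, float * const oy)
{
  /* 2 fma's + 1 round per coordinate; endpoints snap to the subpixel grid */
  *ox = roundf(fmaf(x, t->sx,  fmaf(y, t->shx, t->tx)));
  *oy = roundf(fmaf(x, t->shy, fmaf(y, t->sy,  t->ty)));
}

int main(void)
{
  /* identity pre-scaled by an assumed 32x32 subpixel resolution */
  struct xform const t = { 32.0f, 0.0f, 0.0f, 0.0f, 32.0f, 0.0f };

  float x, y;
  transform_round(&t, 3.37f, 7.81f, &x, &y);
  printf("(3.37, 7.81) pixels -> (%g, %g) subpixels\n", x, y);
  return 0;
}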
- // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - -#if 0 - if (get_sub_group_local_id() == 0) - printf("+cmd_idx = %u\n",cmd_idx); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("-cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("[ %u ]< %u, %u, %u, %u >\n", - cmd_idx, - cmd.nodeword, - SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd), - SKC_CMD_RASTERIZE_GET_CLIP(cmd), - SKC_CMD_RASTERIZE_GET_COHORT(cmd)); -#endif - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id); - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. - // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. 
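A tagged block id packs a small path-segment tag alongside the block id. The sketch below assumes a 5-bit tag in the low bits purely for illustration; the real field widths come from the SKC_TAGGED_BLOCK_ID_* macros:

#include <stdio.h>

/* Assumed split: the tag in the low bits, the block id in the rest. */
#define TAG_BITS 5u
#define TAG_MASK ((1u << TAG_BITS) - 1u)

static unsigned tagged_get_tag(unsigned const tag_id) { return tag_id &  TAG_MASK; }
static unsigned tagged_get_id (unsigned const tag_id) { return tag_id >> TAG_BITS; }

int main(void)
{
  unsigned const tag_id = (1234u << TAG_BITS) | 3u;   /* block id 1234, tag 3 */

  printf("tag = %u, block id = %u\n", tagged_get_tag(tag_id), tagged_get_id(tag_id));
  return 0;
}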
- // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - switch (tag) - { - case SKC_BLOCK_ID_TAG_PATH_LINE: - skc_rasterize_lines(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_QUAD: - skc_rasterize_quads(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_CUBIC: - skc_rasterize_cubics(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); - break; - - case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD: - break; - case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: - break; - - default: - break; - } -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. 
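Since the host is responsible for pre-scaling transforms to subpixel resolution, here is a minimal host-side sketch of that pre-multiplication. Field names mirror the v8 layout above; the w0/w1 perspective terms are ignored and the 32x32 resolution is an assumption:

#include <stdio.h>

/* { sx shx tx shy sy ty } -- the kernel's float8 also carries w0/w1 for a
 * future perspective path; those two terms are ignored here               */
struct affine { float sx, shx, tx, shy, sy, ty; };

/* pre-multiply a user transform by the subpixel scale so the device only
 * ever sees coordinates in subpixel units                                 */
static struct affine prescale(struct affine const u,
                              float const resl_x, float const resl_y)
{
  struct affine const s = {
    u.sx  * resl_x, u.shx * resl_x, u.tx * resl_x,
    u.shy * resl_y, u.sy  * resl_y, u.ty * resl_y
  };
  return s;
}

int main(void)
{
  struct affine const user = { 2.0f, 0.0f, 10.0f, 0.0f, 2.0f, 5.0f };
  struct affine const dev  = prescale(user, 32.0f, 32.0f);

  printf("device-side tx,ty = %g,%g (translation now in subpixels)\n", dev.tx, dev.ty);
  return 0;
}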
- // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. - // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_lines(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. - // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. 
- // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_quads(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - // - // declare shared memory block - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - __local struct skc_subgroup_smem volatile smem[1]; -#else - __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; - __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); -#endif - - // - // this is a subgroup/warp-centric kernel - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler appears to be recognizing - // get_group_id(0) as a uniform but the alternative calculation used - // when there are multiple subgroups per workgroup is not - // cooperating and driving spillage elsewhere. - // -#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) - uint const cmd_idx = get_group_id(0); -#else - uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // if worksgroups are multi-subgroup then there may be excess - // subgroups in the final workgroup - // - if (cmd_idx >= count) - return; - -#if 0 - if (get_sub_group_local_id() == 0) - printf("cmd_idx = %u\n",cmd_idx); -#endif - - // - // load a single command for this subgroup - // - union skc_cmd_rasterize const cmd = cmds[cmd_idx]; - - // - // get first block node command word and its subblock - // - skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing - skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; - skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); - - // - // load transform -- uniform across subgroup - // - // v8: { sx shx tx shy sy ty w0 w1 } - // - // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: - // - // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] - // - // Coordinates are scaled to subpixel resolution. All that matters - // is that continuity is maintained between end path element - // endpoints. - // - // It's the responsibility of the host to ensure that the transforms - // are properly scaled either via intitializing a transform stack - // with the subpixel resolution scaled identity or scaling the - // transform before its loaded by a rasterization grid. 
- // - // FIXME -- horizontal load might be better than this broadcast load - // - union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load - union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load - skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted - - skc_rasterize_cubics(bp_atomics, - bp_elems, - bp_ids, - bp_mask, - cohort_atomics, - sk_extent, - smem, - &nodeword,&id, - &tv,&cv,cohort); -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - ; -} - -// -// -// - -__kernel -SKC_RASTERIZE_KERNEL_ATTRIBS -void -skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global union skc_bp_elem * const bp_elems, - __global uint * const bp_ids, - skc_uint const bp_mask, - - __global SKC_ATOMIC_UINT volatile * const cohort_atomics, - __global skc_ttsk_s_t * const sk_extent, - - __global float8 const * const transforms, // FIXME -- __constant - __global float4 const * const clips, // FIXME -- __constant - __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant - skc_uint const count) -{ - ; -} - -// -// -// diff --git a/src/compute/skc/rasters_alloc.cl b/src/compute/skc/rasters_alloc.cl deleted file mode 100644 index f8f76a7b39..0000000000 --- a/src/compute/skc/rasters_alloc.cl +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "raster_builder_cl_12.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "raster.h" -#include "tile.h" - -// -// There is a fixed-size meta table per raster cohort that we use to -// peform a mostly coalesced sizing and allocation of blocks. -// -// This code is simple and fast. -// - -__kernel -SKC_RASTERS_ALLOC_KERNEL_ATTRIBS -void -skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics, - __global skc_block_id_t const * const bp_ids, - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t * const map, - __global skc_uint * const metas, - __global skc_uint const * const raster_ids, // FIXME -- CONSTANT - skc_uint const count) -{ - // access to the meta extent is linear - skc_uint const gid = get_global_id(0); - skc_bool const is_active = gid < count; - - // - // init with defaults for all lanes - // - union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } }; - skc_uint raster_id = SKC_UINT_MAX; - skc_uint extra_blocks = 0; - - if (is_active) - { - // load meta_in - meta.in.u32v4 = vload4(gid,metas); - - // load raster_id as early as possible - raster_id = raster_ids[gid]; - -#if 0 - printf("%3u + %5u, %5u, %5u, %5u\n", - gid, - meta.in.blocks, - meta.in.offset, - meta.in.pk, - meta.in.rk); -#endif - - // how many blocks will the ttpb blocks consume? 
- extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) / - SKC_DEVICE_SUBBLOCKS_PER_BLOCK); - - // total keys - meta.out.keys += meta.in.pk; - - // how many blocks do we need to store the keys in the head and trailing nodes? - skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) / - (SKC_RASTER_NODE_DWORDS - 1)); - // increment blocks - extra_blocks += hn; - - // how many nodes trail the head? - meta.out.nodes = hn - 1; - - // update blocks - meta.out.blocks += extra_blocks; - -#if 0 - printf("%3u - %5u, %5u, %5u, %5u\n", - gid, - meta.out.blocks, - meta.out.offset, - meta.out.nodes, - meta.out.keys); -#endif - } - - // - // allocate blocks from block pool - // - // first perform a prefix sum on the subgroup to reduce atomic - // operation traffic - // - // note this idiom can be implemented with vectors, subgroups or - // workgroups - // - - skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks); - skc_uint reads = 0; - - // last lane performs the block pool allocation with an atomic increment - if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) { - reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads - } - - // broadcast block pool base to all lanes - reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1); - - // update base for each lane - reads += prefix - extra_blocks; - - // - // store meta header - // - if (is_active) - { - // store headers back to meta extent - vstore4(meta.out.u32v4,gid,metas); - - // store reads - metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads; - - // get block_id of each raster head - skc_block_id_t const block_id = bp_ids[reads & bp_mask]; - - // update map - map[raster_id] = block_id; - -#if 0 - printf("alloc: %u / %u\n",raster_id,block_id); -#endif - } -} - -// -// -// diff --git a/src/compute/skc/rasters_reclaim.cl b/src/compute/skc/rasters_reclaim.cl deleted file mode 100644 index f0abdb0381..0000000000 --- a/src/compute/skc/rasters_reclaim.cl +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
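The allocation idiom above -- an inclusive scan across the subgroup, a single atomic add issued by the last lane, then a broadcast and a per-lane base adjustment -- can be restated in plain C as follows. The lane counts, the ring cursor and the pow2 mask are illustrative:

#include <stdio.h>

#define LANES 8

int main(void)
{
  unsigned const extra[LANES] = { 2, 0, 3, 1, 0, 4, 2, 1 };   /* blocks wanted per lane */

  /* inclusive prefix sum across the "subgroup" */
  unsigned prefix[LANES], sum = 0;
  for (int i = 0; i < LANES; i++) { sum += extra[i]; prefix[i] = sum; }

  /* only the last lane touches the shared cursor -- an atomic fetch-add
   * in the kernel -- and fetches the old value                            */
  unsigned ring_reads  = 100;               /* illustrative starting cursor */
  unsigned const reads = ring_reads;
  ring_reads += prefix[LANES - 1];

  /* every lane derives its own base slot in the pow2 block-pool id ring */
  unsigned const bp_mask = 0x3FFu;
  for (int i = 0; i < LANES; i++)
    {
      unsigned const base = reads + prefix[i] - extra[i];
      printf("lane %d: %u ids starting at ring slot %u\n", i, extra[i], base & bp_mask);
    }

  return 0;
}

One scan plus one atomic per subgroup, instead of one atomic per lane, is exactly the "mostly coalesced" sizing the comment above describes.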
- * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "block_pool_cl.h" -#include "atomic_cl.h" -#include "block.h" -#include "raster.h" -#include "common.h" -#include "tile.h" - -// -// -// - -#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS) - -#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS) - -// -// -// - -#if ( SKC_RASTERS_RECLAIM_X == 1 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0 - -#elif ( SKC_RASTERS_RECLAIM_X == 2 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1 - -#elif ( SKC_RASTERS_RECLAIM_X == 4 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3 - -#elif ( SKC_RASTERS_RECLAIM_X == 8 ) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7 - -#elif ( SKC_RASTERS_RECLAIM_X == 16) -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() -#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15 - -#else -#error "MISSING SKC_RASTERS_RECLAIM_X" -#endif - -#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \ - (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 - -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask - -#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) -#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) -#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) - -#endif - -// -// FIXME -- slate these for replacement -// - -#define SKC_BROADCAST(E,S,I) \ - sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_BROADCAST_LAST_HELPER(E,I) \ - sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - -#define SKC_BROADCAST_LAST(E,I) \ - 
SKC_BROADCAST_LAST_HELPER(E,I) - -// -// COMPILE-TIME PREDICATES -// - -#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \ - SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \ - (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \ - (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) - -#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \ - SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I) - -#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \ - SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I) - -// -// RUN-TIME PREDICATES -// - -#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \ - (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS) - -// -// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL -// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK -// COMBOS (NOT NECESSARILY POW2) -// -// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR -// UINT TYPE INSTEAD OF A ULONG. -// - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint - -// -// -// - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ - (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ - ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ - S = sub_group_scan_exclusive_add(C) - -#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \ - (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK) - -// -// -// - -struct skc_reclaim -{ - skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE]; -}; - -__kernel -SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS -void -skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring - __global skc_uint * const bp_elems, // block pool blocks - __global skc_uint volatile * const bp_atomics, // read/write atomics - skc_uint const bp_mask, // pow2 modulo mask for block pool ring - __global skc_block_id_t const * const map, // raster host-to-device map - struct skc_reclaim const reclaim) // array of host raster ids -{ -#if (__OPENCL_VERSION__ < 200) - skc_uint const reclaim_stride = get_num_sub_groups(); -#else - skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups -#endif - skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); - -#if 0 - // - // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT - // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL - // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE - // RECLAMATION JOB ON THE REST OF THE PIPELINE. 
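The packed-count machinery above folds one small counter per register slot into a single uint so that one subgroup exclusive scan produces every slot's index at once. A scalar sketch with an assumed 3-bit field width and two slots per lane:

#include <stdio.h>

#define LANES      8
#define SLOTS      2    /* registers ("I") per lane                               */
#define FIELD_BITS 3    /* log2(LANES): an exclusive prefix never exceeds LANES-1 */
#define FIELD_MASK ((1u << FIELD_BITS) - 1u)

int main(void)
{
  /* does register slot s of lane l hold a reclaimable (block-aligned) id? */
  unsigned const is_block[LANES][SLOTS] = {
    {1,0},{0,1},{1,1},{0,0},{1,0},{1,1},{0,0},{1,0}
  };

  /* each lane packs one counter per slot into a single uint */
  unsigned packed[LANES];
  for (int l = 0; l < LANES; l++)
    {
      packed[l] = 0;
      for (int s = 0; s < SLOTS; s++)
        packed[l] |= is_block[l][s] << (FIELD_BITS * s);
    }

  /* one exclusive scan across lanes advances every field at once */
  unsigned scan = 0;
  for (int l = 0; l < LANES; l++)
    {
      unsigned const index = scan;   /* exclusive prefix seen by this lane */
      scan += packed[l];

      for (int s = 0; s < SLOTS; s++)
        printf("lane %d slot %d: count=%u index=%u\n", l, s,
               (packed[l] >> (FIELD_BITS * s)) & FIELD_MASK,
               (index     >> (FIELD_BITS * s)) & FIELD_MASK);
    }

  return 0;
}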
- // - for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) -#endif - { - // get host raster id - skc_raster_h const raster = reclaim.aN[reclaim_idx]; - - // get block id of raster header - skc_block_id_t id = map[raster]; - - // - // load all of the head block ttxk.lo keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // pick out count.nodes and count.prims from the header - // - // load raster header counts -- we only need the blocks and - // nodes words the keys are doublewords. - // - // FIXME -- this can be made portable with compile-time macro expansion - // - skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES - skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS - -#if 0 - if (get_sub_group_local_id() == 0) { - printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes); - } -#endif - // - // acquire a span in the block pool ids ring for reclaimed ids - // - skc_uint bp_ids_base = 0; - - if (get_sub_group_local_id() == 0) { - bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); - } - - bp_ids_base = sub_group_broadcast(bp_ids_base,0); - - // - // mask off everything but the block id - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - h##I = h##I & SKC_TTXK_LO_MASK_ID; \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; -#if 0 - printf("rasters next = %u\n",id); -#endif - } - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%08X %u\n",h##I,h##I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - printf("%08X\n",h##I); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - - // - // - we'll skip subgroups that are entirely header - // - // - but we need to mark any header elements that partially fill - // a subgroup as subblocks - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \ - if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \ - h##I = SKC_UINT_MAX; \ - } \ - } \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - { - // - // count reclaimable blocks in each lane - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // 
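The "swap current id with next" step above works because the last masked key in each block is the link to the next block; writing the current id into that register lets the same ring store recycle the block being walked. A simplified sequential walk of such a chain:

#include <stdio.h>

#define BLOCK_WORDS 4
#define POOL_BLOCKS 8
#define END_OF_LIST 0xFFFFFFFFu

int main(void)
{
  /* tiny block pool: the last word of each block links to the next block */
  unsigned pool[POOL_BLOCKS][BLOCK_WORDS] = { { 0 } };

  /* build a 3-block chain: 5 -> 2 -> 6 -> end */
  pool[5][BLOCK_WORDS - 1] = 2;
  pool[2][BLOCK_WORDS - 1] = 6;
  pool[6][BLOCK_WORDS - 1] = END_OF_LIST;

  unsigned id = 5;   /* head block id, e.g. from the raster map */

  while (id != END_OF_LIST)
    {
      unsigned const next = pool[id][BLOCK_WORDS - 1];

      /* the kernel overwrites its in-register copy of this link with the
       * current id so the very same ring store recycles this block too   */
      printf("reclaim block %u\n", id);

      id = next;
    }

  return 0;
}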
store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ - skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = h##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - } - - // printf("R %7u ! %u\n",bp_ids_idx,h##I); - - // - // we're done if it was just the header - // - if (count_nodes == 0) - return; - - // - // otherwise, walk the nodes - // - do { - // id of next block is in last lane - id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); - - // - // load all of the node block ttxk.lo keys into registers - // - // FIXME -- this pattern lends itself to using the higher - // performance Intel GEN block load instructions - // - skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // mask off everything but the block id - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - n##I = n##I & SKC_TTXK_LO_MASK_ID; - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // swap current id with next - // - if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) - { - skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); - - SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; - - id = next; -#if 0 - printf("rasters next = %u\n",id); -#endif - } - -#if 0 -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - printf("%08X %u\n",n##I,n##I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); -#endif - - // - // count reclaimable blocks in each lane - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) \ - packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // - // scan to find index of each block - // - SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); - - SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); - - // - // store blocks back to ring - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,R) { \ - skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ - skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ - skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ - if (count > 0) { \ - bp_ids[bp_ids_idx] = n##I; \ - } \ - skc_uint const total = index + count; \ - bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ - } - - SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); - - // printf("R %7u ! %u\n",bp_ids_idx,n##I); - - // any more nodes? - } while (--count_nodes > 0); - } -} - -// -// -// diff --git a/src/compute/skc/render.cl b/src/compute/skc/render.cl deleted file mode 100644 index ba2fd7bbfd..0000000000 --- a/src/compute/skc/render.cl +++ /dev/null @@ -1,2165 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include "device_cl_12_gen9.h" -#include "block.h" -#include "tile.h" -#include "atomic_cl.h" -#include "styling_types.h" - -// -// -// - -#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1) - -// -// -// - -#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 ) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7 - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16) -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16() -#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15 -#endif - -// -// tile state flag bits -// - -typedef enum skc_tile_flags_e { - - // FLUSH - SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001, - SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002, - SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004, - - // OPACITY - SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008, - - // - // Note: testing for opacity and skipping scattering is on its way - // to becoming a much more programmable option because sometimes we - // may be compositing/blending from back-to-front and/or be using - // group blend rules that ignore opacity. - // - // The point is that all of these decisions should be encoded in - // styling commands and, as much as possible, removed from the final - // group/layer styling traversal render loop. - // - -} skc_tile_flags_e; - -// -// COVER -- assumes availability of either fp16 or fp32 -// - -union skc_tile_cover -{ - struct { - SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH]; - } aN; - -#ifdef SKC_RENDER_TILE_COVER_VECTOR - struct { - SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT]; - } vN; -#endif -}; - -// -// COLOR -- assumes availability of either fp16 or fp32 -// - -union skc_tile_color -{ - union { - struct { - SKC_RENDER_TILE_COLOR r; - SKC_RENDER_TILE_COLOR g; - SKC_RENDER_TILE_COLOR b; - SKC_RENDER_TILE_COLOR a; - } rgba[SKC_TILE_WIDTH]; - } aN; - -#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED - union { - SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH]; - } iN; -#endif - -#ifdef SKC_RENDER_TILE_COLOR_VECTOR - union { - SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT]; - } vN; -#endif - - struct { - union { - struct { - SKC_RENDER_TILE_COLOR r; - SKC_RENDER_TILE_COLOR g; - }; - SKC_RENDER_GRADIENT_FLOAT distance; - }; - union { - struct { - SKC_RENDER_TILE_COLOR b; - SKC_RENDER_TILE_COLOR a; - }; - SKC_RENDER_GRADIENT_FLOAT stoplerp; - }; - } grad[SKC_TILE_WIDTH]; -}; - -// -// SHARED MEMORY STATE -// - -#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT) - -#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE) -#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA)) - -// -// -// - -union skc_subgroup_smem -{ - // - // The tiles are stored in column-major / height-major order - // - // The final column is a guard column that is OK to write to but - // will never be read. It simplifies the TTSB scatter but could be - // predicated if SMEM is really at a premium. 
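The guard-column layout described above means a scatter that falls just past the right edge of the tile still has a legal destination. A tiny C model of the column-major indexing:

#include <stdio.h>

#define TILE_WIDTH  8
#define TILE_HEIGHT 8

/* column-major accumulator with one guard column: area[w][h] */
static int area[(TILE_WIDTH + 1) * TILE_HEIGHT];

static int * area_at(int const w, int const h)
{
  return &area[w * TILE_HEIGHT + h];   /* height-major within each column */
}

int main(void)
{
  /* a scatter aimed one column past the tile lands in the guard column:
   * written but never read back, so no bounds test is needed             */
  *area_at(TILE_WIDTH, 3) += 42;

  *area_at(2, 5) += 7;
  printf("area[2][5] = %d\n", *area_at(2, 5));
  return 0;
}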
- // -#if ( SKC_RENDER_SUBGROUP_SIZE > 1 ) - struct { - SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] - } atomic; -#endif - - struct { - int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] - } aN; - - struct { // assumption is that height = subgroup - SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE]; - } vN; - - struct { // assumption is that height = subgroup - SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE]; - } wide; - - union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; - - half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; - -#if 0 - // - // SPILL TO GMEM - // -#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0) - struct { - -#if (SKC_REGS_COLOR_S > 0) - union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; -#endif - -#if (SKC_REGS_COVER_S > 0) - union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; -#endif - - } regs; -#endif - // - // - // -#endif -}; - -// -// -// - -#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) - -#define skc_subgroup_lane() 0 - -#else - -#define skc_subgroup_lane() get_sub_group_local_id() - -#endif - -// -// -// - -typedef skc_uint skc_ttsk_lo_t; -typedef skc_uint skc_ttsk_hi_t; - -typedef skc_uint skc_ttpk_lo_t; -typedef skc_uint skc_ttpk_hi_t; - -typedef skc_uint skc_ttxk_lo_t; -typedef skc_uint skc_ttxk_hi_t; - -typedef skc_uint skc_ttck_lo_t; -typedef skc_uint skc_ttck_hi_t; - -typedef skc_uint2 skc_ttck_t; - -typedef skc_int skc_ttxb_t; - -// -// TTCK (32-BIT COMPARE) v1: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 18 | 7 | 7 | -// -// -// TTCK (32-BIT COMPARE) v2: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 30 | 1 | 1 | 15 | 9 | 8 | -// -// -// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: -// -// 0 63 -// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | -// +----------------------+--------+--------+-------+-----+-----+ -// | 27 | 1 | 1 | 18 | 9 | 8 | -// - -static -skc_uint -skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) -{ - return a & SKC_TTCK_LO_MASK_ID; -} - -static -skc_layer_id -skc_ttck_get_layer(skc_ttck_t const a) -{ - // - // FIXME -- a union with a ulong and a shift down and mask is - // probably faster on some architectures - // - skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); - skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; - - return lo | hi; -} - -static -skc_uint -skc_ttck_hi_get_x(skc_ttck_hi_t const a) -{ - return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); -} - -static -skc_uint -skc_ttck_hi_get_y(skc_ttck_hi_t const a) -{ - return a >> SKC_TTCK_HI_OFFSET_Y; -} - -static -skc_bool -skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b) -{ - skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); - skc_uint const hi = (a.hi ^ b.hi); - - return (lo | hi) == 0; -} - -static -skc_bool -skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) -{ - return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; -} - -static -skc_bool -skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) -{ - return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; -} - -// -// TILE TRACE SUBPIXEL -// -// The subpixels are encoded with either absolute tile coordinates -// (32-bits) or packed in delta-encoded form 
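Putting the helper functions and the "64-bit compare" layout above together gives a plain-C decode of one TTCK key. The exact field placement, with three layer bits spilling into the low word, is inferred from those helpers and should be treated as a sketch:

#include <stdint.h>
#include <stdio.h>

/* Assumed placement for the 64-bit-compare variant:
 *   lo: | id:27 | prefix:1 | escape:1 | layer[2:0]:3 |
 *   hi: | layer[17:3]:15 | x:9 | y:8 |
 */
typedef struct { uint32_t lo, hi; } ttck_t;

static uint32_t ttck_id    (ttck_t const k) { return k.lo & ((1u << 27) - 1u); }
static uint32_t ttck_prefix(ttck_t const k) { return (k.lo >> 27) & 1u; }
static uint32_t ttck_layer (ttck_t const k) { return (k.lo >> 29) | ((k.hi & 0x7FFFu) << 3); }
static uint32_t ttck_x     (ttck_t const k) { return (k.hi >> 15) & 0x1FFu; }
static uint32_t ttck_y     (ttck_t const k) { return  k.hi >> 24; }

int main(void)
{
  /* pack a key by hand: id=1000, prefix=1, layer=300, x=17, y=9 */
  uint32_t const layer = 300u;
  ttck_t   const k     = {
    .lo = 1000u | (1u << 27) | ((layer & 0x7u) << 29),
    .hi = (layer >> 3) | (17u << 15) | (9u << 24)
  };

  printf("id=%u prefix=%u layer=%u x=%u y=%u\n",
         (unsigned)ttck_id(k), (unsigned)ttck_prefix(k),
         (unsigned)ttck_layer(k), (unsigned)ttck_x(k), (unsigned)ttck_y(k));
  return 0;
}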
form. -// -// For 32-bit subpixel packing of a 32x32 tile: -// -// A tile X is encoded as: -// -// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate. -// -// SX : 6 : unsigned subpixel span from min to max x with range -// [0,32]. The original direction is not captured. Would -// be nice to capture dx but not necessary right now but -// could be in the future. <--- SPARE VALUES AVAILABLE -// -// A tile Y is encoded as: -// -// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate. -// -// DY : 6 : signed subpixel delta y1-y0. The range of delta is -// [-32,32] but horizontal lines are not encoded so [1,32] -// is mapped to [0,31]. The resulting range [-32,31] fits -// in 6 bits. -// -// TTS: -// -// 0 31 -// | TX | SX | TY | DY | -// +-----+------+-----+------+ -// | 10 | 6 | 10 | 6 | -// - -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a) -{ - // - // extract the whole pixel y coordinate - // - return SKC_BFE(a, - SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2, - SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2); -} - -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a) -{ - // - // get the linear array tile index of the pixel - // - return (((a & SKC_TTS_MASK_TX_PIXEL) - -#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2) - >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2) -#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2) - << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2) -#endif - - ) | skc_tts_get_ty_pixel_v(a)); -} - -#if 0 -static -skc_ttx_v_s32_t -skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) -{ - skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY; - - return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31)); -} -#else -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) -{ - SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY; - - return dy - (~a >> 31); -} -#endif - -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a) -{ - return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2); -} - -static -SKC_RENDER_TTS_V_BITFIELD -skc_tts_get_sx_v(SKC_RENDER_TTS_V const a) -{ - return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX); -} - -// -// -// - -static -void -skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem) -{ - // - // SIMD / CPU - // - // & - // - // SIMT / GPU - // - // Note that atomic_init() is likely implemented as a simple - // assignment so there is no identifiable performance difference on - // current targets. - // - // If such an architecture appears in the future then we'll probably - // still want to implement this zero'ing operation as below but - // follow with an appropriate fence that occurs before any scatter - // operations. - // - // The baroque expansion below improves performance on Intel GEN by, - // presumably, achieving the 64-byte per clock SLM write as well as - // minimizing the overall number of SEND() block initializations and - // launches. - // - // Intel GENx has a documented 64 byte per cycle SLM write limit. - // So having each lane in an 8 lane subgroup zero-write 8 bytes is - // probably a safe bet (Later: benchmarking backs this up!). - // - // Note there is no reason at this time to unroll this loop. - // - for (uint ii=0; iiwide.area[ii][skc_subgroup_lane()] = ( 0 ); -} - -// -// Note this is going to be vectorizable on most architectures. -// -// The return of the key translation feature might complicate things. 
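A standalone decode of the TTS word documented above, including the [0,31] to [1,32] remapping of non-negative dy. The low-to-high field ordering and the reliance on an arithmetic right shift mirror the accessors above but are stated here as assumptions:

#include <stdint.h>
#include <stdio.h>

/* | tx:10 | sx:6 | ty:10 | dy:6 | -- dy is signed and lives in the top bits */
static uint32_t tts_tx(uint32_t const a) { return  a        & 0x3FFu; }
static uint32_t tts_sx(uint32_t const a) { return (a >> 10) & 0x3Fu;  }
static uint32_t tts_ty(uint32_t const a) { return (a >> 16) & 0x3FFu; }

static int32_t tts_dy(uint32_t const a)
{
  int32_t const dy = (int32_t)a >> 26;       /* arithmetic shift sign-extends */

  /* stored range is [-32,31]; [0,31] maps back to [1,32] because dy == 0
   * never occurs -- i.e. "if dy >= 0 then ++dy"                            */
  return dy - (~(int32_t)a >> 31);
}

int main(void)
{
  uint32_t const a = ((uint32_t)7 << 26) | (5u << 16) | (3u << 10) | 9u;

  printf("tx=%u sx=%u ty=%u dy=%d\n",
         (unsigned)tts_tx(a), (unsigned)tts_sx(a), (unsigned)tts_ty(a), (int)tts_dy(a));
  return 0;   /* tx=9 sx=3 ty=5 dy=8 */
}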
-// - -static -void -skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, - __local union skc_subgroup_smem * SKC_RESTRICT const smem, - skc_block_id_t const pb_id) -{ - skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); - -#if ( SKC_TILE_RATIO == 1 ) - - SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset]; - -#elif ( SKC_TILE_RATIO == 2 ) - - SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent); - -#else - -#error("tile ratio greater than 2 not supported") - -#endif - - // - // Note there is no need to use an atomic for this operation on the - // current group of target platforms... but this may change if - // atomic ops truly go through a different path. - // - // As noted above, this direct increment is probably faster and can - // always be followed by a fence. - // - // Furthermore, note that the key sorting orders all ttck keys - // before ttpk keys. - // - - // - // FIXME -- if the SMEM store is wider than bank word count then we - // might want to odd-even interleave the TTP values if the target - // device can't handle 64-bit stores - // - - // - // skipping per-key translation for now - // - smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1); -} - -// -// Note that skc_scatter_ttsb is *not* vectorizable unless the -// architecture supports a "scatter-add" capability. All relevant -// GPUs support atomic add on shared/local memory and thus support -// scatter-add. -// - -static -void -skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, - __local union skc_subgroup_smem * SKC_RESTRICT const smem, - skc_block_id_t const sb_id) -{ - skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); - - SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset]; - - // - // Skipping per-key translation for now - // - - // Index into tile - // - // The tiles are stored in column-major / height-major order - // - // The final column is a guard column that is OK to write to but - // will never be read. It simplifies the TTSB scatter but could be - // predicated if SMEM is really at a premium. - // - - SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v); - -#if 0 - if (tts_v != SKC_TTS_INVALID) - printf("(%08X) = %u\n",tts_v,xy_idx); -#endif - - // - // adjust subpixel range to max y - // - // range is stored as [-32,31] and when read [0,31] is mapped to - // [1,32] because a dy of 0 is not possible. - // - // more succinctly: if dy >= 0 then ++dy - // - SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v); - - // - // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid? - // - - // this "min(x0) * 2 + dx" is equivalent to "x0 + x1" - SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v); - - // Calculate left and right coverage contribution trapezoids - SKC_RENDER_TTS_V_BITFIELD const left = dy * widths; - SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left; - - // - // Accumulate altitudes and areas - // - // Optimization: if the device supports an CPU/SIMD vector-add or - // GPU/SIMT scatter-add atomic int2 add operation then placing the - // ALT and AREA values side-by-side would halve the number of - // additions. 
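The left/right split above always accounts for the segment's full dy * 2 * resolution area. A quick numeric check, with the subpixel resolution assumed to be 32:

#include <stdio.h>

#define SUBPIXEL_RESL_X_LOG2 5   /* assumed 32 subpixels per pixel */

int main(void)
{
  /* one segment crossing a pixel column: subpixel x endpoints, signed dy */
  int const x0 = 6, x1 = 20, dy = 9;

  int const widths = (x0 < x1 ? x0 : x1) * 2 + (x1 > x0 ? x1 - x0 : x0 - x1);
  /* "min(x)*2 + dx" is simply x0 + x1 */

  int const left  = dy * widths;
  int const right = (dy << (SUBPIXEL_RESL_X_LOG2 + 1)) - left;

  printf("left=%d right=%d sum=%d (dy * 64 = %d)\n",
         left, right, left + right, dy * 64);
  return 0;
}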
- // -#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) - // - // CPU/SIMD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C != SKC_TTS_INVALID) { \ - smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \ - smem->aN.area[ xy_idx C] += right C; \ - } - -#else - // - // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD - // -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) \ - if (tts_v C != SKC_TTS_INVALID) { \ - SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \ - SKC_TILE_HEIGHT + xy_idx C, \ - left C); \ - SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \ - right C); \ - } -#endif - - SKC_RENDER_TTSB_EXPAND(); -} - -// -// Note that 2048.0 can be represented exactly with fp16... fortuitous! -// - -#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y) -#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA) -#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1) -#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA) - -// -// -// - -static -void -skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem, - union skc_tile_cover * SKC_RESTRICT const cover, - union skc_tile_color * SKC_RESTRICT const color) -{ - SKC_RENDER_ACC_COVER_INT area = 0; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 - for (uint ii=0; iivN.area[ii][skc_subgroup_lane()]; - SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); - SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA)); - - cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32); - } -} - -static -void -skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem, - union skc_tile_cover * SKC_RESTRICT const cover, - union skc_tile_color * SKC_RESTRICT const color) -{ - SKC_RENDER_ACC_COVER_INT area = 0; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 - for (uint ii=0; iivN.area[ii][skc_subgroup_lane()]; - SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); - SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA)); - - cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32; - } -} - -// -// -// - -static -void -skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands, - uint * SKC_RESTRICT const cmd_next, - union skc_tile_color * SKC_RESTRICT const color) -{ - // - // rgba = solid fill - // - __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; - - *cmd_next += 2; - -#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) - - SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=0; iiaN.rgba[ii].r = rg.lo; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=0; iiaN.rgba[ii].g = rg.hi; - - SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=0; iiaN.rgba[ii].b = ba.lo; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=0; iiaN.rgba[ii].a = ba.hi; - -#else - - SKC_RENDER_TILE_COLOR_PAIR const rg = 
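The two cover functions above reduce an accumulated signed area to [0,1] coverage under the nonzero and even-odd fill rules. A plain-C restatement using the same 2048 max-area constant, assuming the 32x32 subpixel grid:

#include <stdio.h>
#include <stdlib.h>

#define FILL_MAX_AREA      2048   /* 2 * 32 * 32 subpixel units == full coverage */
#define FILL_MAX_AREA_2    (2 * FILL_MAX_AREA)
#define FILL_EVEN_ODD_MASK (FILL_MAX_AREA_2 - 1)

static float cover_nonzero(int const area)
{
  int const a = abs(area);
  int const c = a < FILL_MAX_AREA ? a : FILL_MAX_AREA;   /* saturate at one winding */
  return (float)c / FILL_MAX_AREA;
}

static float cover_evenodd(int const area)
{
  int const a       = abs(area) & FILL_EVEN_ODD_MASK;    /* wrap every two windings */
  int const reflect = abs(a - FILL_MAX_AREA);            /* triangle wave           */
  return (float)(FILL_MAX_AREA - reflect) / FILL_MAX_AREA;
}

int main(void)
{
  int const samples[] = { 0, 1024, 2048, 3072, 4096, -1024 };

  for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
    printf("area %5d : nonzero %.2f  even-odd %.2f\n",
           samples[i], cover_nonzero(samples[i]), cover_evenodd(samples[i]));
  return 0;
}

Note how the even-odd mapping rises to full coverage at one winding's worth of area and falls back to zero at two, while the nonzero mapping simply saturates.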
SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); - SKC_RENDER_TILE_COLOR const r = rg.lo; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r); - - SKC_RENDER_TILE_COLOR const g = rg.hi; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g); - - SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); - SKC_RENDER_TILE_COLOR const b = ba.lo; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b); - - SKC_RENDER_TILE_COLOR const a = ba.hi; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a); - -#endif -} - -// -// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" -// -// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ -// -// Lerp in two fma/mad ops: -// -// t * b + ((-t) * a + a) -// -// Note: OpenCL documents mix() as being implemented as: -// -// a + (b - a) * t -// -// But this may be a native instruction on some devices. For example, -// on GEN9 there is an LRP "linear interoplation" function but it -// doesn't appear to support half floats. -// - -#if 1 -#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) -#else -#define SKC_LERP(a,b,t) mix(a,b,t) -#endif - -// -// CPUs have a mock local address space so copying the gradient header -// is probably not useful. Just read directly from global. -// - -#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL -#define SKC_RENDER_GRADIENT_SPACE __local -#else -#define SKC_RENDER_GRADIENT_SPACE __global -#endif - -// -// gradient is non-vertical -// -// removed the vertical (actually, horizontal) special case -// - -static -void -skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem, - __global union skc_styling_cmd const * SKC_RESTRICT const commands, - uint * SKC_RESTRICT const cmd_next, - union skc_tile_color * SKC_RESTRICT const color, - skc_ttck_hi_t const ttck_hi) -{ - // - // Where is this tile? - // - // Note that the gradient is being sampled from pixel centers. - // - SKC_RENDER_GRADIENT_FLOAT const y = -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P - (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) + - (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE)); - - float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH); - - // - // Get starting numerator and denominator - // - // Note: if gh[0].dx is exactly 0.0f then this is a vertical - // gradient and can be handled by a special opcode. - // - // Note: the mad() ordering is slightly different than the original - // CUDA implementation. - // - union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) }; - - *cmd_next += 4; - - float const gv_x_dot = mad(x,gv.dx,gv.p0); - SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot); - - // - // Where are columns along gradient vector? - // - // TODO: Note that the gv_denom isn't multiplied through. - // - // Please doublecheck this... but I recall that in certain cases - // this wipes out some precision and results in minor but noticeable - // gradient artifacts. 
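
Stripped of the vectorization, the per-column distance computed just below is (p0 + x*dx + y*dy) * denom, after which the styling command selects one of three wrap modes. The scalar sketch below restates that parameterization; the struct field names and ordering are assumptions based on what is visible in this kernel, and the wrap helpers mirror the expressions used in the switch rather than any confirmed host-side definition.

/* scalar model of the linear-gradient parameterization and its wrap modes */
#include <math.h>
#include <stdio.h>

struct grad_vec { float p0, dx, dy, denom; };  /* layout assumed, not confirmed */

/* gradient parameter t for the pixel center (px, py) */
static float grad_t(struct grad_vec const * gv, float px, float py)
{
  float const numer = fmaf(py, gv->dy, fmaf(px, gv->dx, gv->p0));
  return numer * gv->denom;
}

/* the three wrap modes dispatched on the gradient-type command word */
static float wrap_clamp  (float t) { return fminf(fmaxf(t, 0.0f), 1.0f); }
static float wrap_repeat (float t) { return t - floorf(t); }
static float wrap_reflect(float t) { float const a = fabsf(t); return fabsf(a - rintf(a)); }

int main(void)
{
  /* gradient running 100 pixels along +x: dx = 1/100, dy = 0 */
  struct grad_vec const gv = { 0.0f, 0.01f, 0.0f, 1.0f };
  float const t = grad_t(&gv, 25.5f, 7.5f);

  printf("t=%.3f clamp=%.3f repeat=%.3f reflect=%.3f\n",
         t, wrap_clamp(t), wrap_repeat(t), wrap_reflect(t));
  return 0;
}
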
- // - // All arguments are scalars except gv_numer so a simpler - // evaluation might save some flops. - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom; - - // - // is gradient non-repeating, repeating or reflecting? - // - switch (commands[(*cmd_next)++].u32) - { - case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING: - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f); - break; - - case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING: - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].distance -= floor(color->grad[ii].distance); - break; - - default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING - // - // OPTIMIZATION: Can this be done in fewer than ~4 ops? - // - // Note: OpenCL "rint()" is round-to-nearest-even integer! - // - // Note: the floor() "round to -inf" op is implemented in the - // GEN op 'FRC' so probably don't use trunc() when floor will - // suffice. - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].distance); - color->grad[ii].distance = fabs(dist_abs - rint(dist_abs)); - } - } - - // - // initialize "stoplerp" for all columns - // - uint const slope_count = commands[(*cmd_next)++].u32; - uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME - - { - float const slope = commands[(*cmd_next)++].f32; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].stoplerp = color->grad[ii].distance * slope; - } - - // - // compute stoplerp for remaining stops - // - for (int jj=1; jjgrad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp); - } - - // - // copy gradient colors to local memory - // - uint const gd_n = slope_count + 1; - -#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL - // - // copy entire gradient descriptor to local memory - // - for (uint ii=skc_subgroup_lane(); iicmds[ii].u32 = commands[*cmd_next + ii].u32; - - __local half const * const SKC_RESTRICT gc = smem->gc + 0; -#else - // - // prefetch entire gradient header - // - // no noticeable impact on performance - // - // prefetch(&commands[*cmd_next].u32,gh_words); - // - __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0; -#endif - - // - // adjust cmd_next so that V1 structure is consumed -- FIXME - // - *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n); - - // - // lerp between color pair stops - // - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iigrad[ii].stoplerp); - SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp)); - - { - SKC_RENDER_TILE_COLOR lo, hi; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \ - lo C = cc.lo; \ - hi C = cc.hi; \ - } - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac); - } - - // - // - // - { - SKC_RENDER_TILE_COLOR lo, hi; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \ - lo C = cc.lo; \ - hi C = cc.hi; \ - } - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac); - } - - // - // - // - { - SKC_RENDER_TILE_COLOR lo, hi; - 
-#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \ - lo C = cc.lo; \ - hi C = cc.hi; \ - } - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac); - } - - // - // - // - { - SKC_RENDER_TILE_COLOR lo, hi; - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \ - lo C = cc.lo; \ - hi C = cc.hi; \ - } - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac); - } - } -} - -// -// -// - -static -void -skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc, - union skc_tile_cover const * SKC_RESTRICT const cover_wip, - union skc_tile_color const * SKC_RESTRICT const color_wip) -{ - // - // fralunco = cover.wip * acc.a - // - // acc.r = fralunco * wip.r + acc.r - // acc.g = fralunco * wip.g + acc.g - // acc.b = fralunco * wip.b + acc.b - // acc.a = -fralunco * wip.a + acc.a - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] * color_acc->aN.rgba[ii].a; - - color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); - color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); - color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); - color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); - } -} - -// -// -// - -static -void -skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc, - union skc_tile_cover const * SKC_RESTRICT const cover_wip, - union skc_tile_color const * SKC_RESTRICT const color_wip) -{ - // - // cover_min = min(cover.wip,a.acc) - // - // r.acc = cover_min * r.wip + r.acc - // g.acc = cover_min * g.wip + g.acc - // b.acc = cover_min * b.wip + b.acc - // a.acc = -cover_min * a.wip + a.acc - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii],color_acc->aN.rgba[ii].a); - - color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); - color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); - color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); - color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); - } -} - -// -// -// - -static -void -skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc, - union skc_tile_cover const * SKC_RESTRICT const cover_wip, - union skc_tile_color const * SKC_RESTRICT const color_wip) -{ - // - // r.acc = (cover.wip * r.wip) * r.acc - // g.acc = (cover.wip * g.wip) * g.acc - // b.acc = (cover.wip * b.wip) * b.acc - // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha) - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r; - color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g; - color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b; - color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a; - } -} - -// -// -// - -static -void -skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc, - union skc_tile_color * SKC_RESTRICT const color_acc, - union skc_tile_cover 
const * SKC_RESTRICT const cover_wip, - union skc_tile_color const * SKC_RESTRICT const color_wip) -{ - // - // cover.wip.contrib = (1.0 - cover.acc) * cover.wip - // cover.acc = cover.acc + cover.wip.contrib - // - // r.acc = cover.wip.contrib * r.wip + r.acc - // g.acc = cover.wip.contrib * g.wip + g.acc - // b.acc = cover.wip.contrib * b.wip + b.acc - // a.acc = -cover.wip.contrib * a.wip * a.acc - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii]) * cover_wip->aN.c[ii]; - - cover_acc->aN.c[ii] += contrib; - - color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); - color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); - color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); - color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); - } -} - -// -// -// - -static -void -skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk, - union skc_tile_cover const * SKC_RESTRICT const cover_wip) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = cover_wip->aN.c[ii]; - -#else - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = cover_wip->vN.c[ii]; - -#endif -} - -// -// -// - -static -void -skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk, - union skc_tile_cover const * SKC_RESTRICT const cover_acc) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = cover_acc->aN.c[ii]; - -#else - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNTN))) - for (uint ii=0; iivN.c[ii] = cover_acc->vN.c[ii]; - -#endif -} - -// -// -// - -static -void -skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc, - union skc_tile_cover const * SKC_RESTRICT const cover_wip) -{ - // - // cover.wip.contrib = (1.0 - cover.acc) * cover.wip - // cover.acc = cover.acc + cover.wip.contrib - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]); -} - -// -// -// - -static -void -skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip, - union skc_tile_cover const * SKC_RESTRICT const cover_msk) -{ - // - // cover.wip *= cover.msk - // - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] *= cover_msk->aN.c[ii]; -} - -// -// -// - -static -void -skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 0; - -#else - // - // GEN9 compiler underperforms on this - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = 0; - -#endif -} - -static -void -skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 0; - -#else - // - // GEN9 compiler underperforms on this - // - - // 
__attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = 0; - -#endif -} - -static -void -skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 0; - -#else - // - // GEN9 compiler underperforms on this - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = 0; - -#endif -} - -// -// -// - -static -void -skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 1; - -#else - // - // GEN9 compiler underperforms on this - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE; - -#endif -} - -// -// -// - -static -void -skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover) -{ -#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.c[ii] = 1 - cover->aN.c[ii]; - -#else - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) - for (uint ii=0; iivN.c[ii] = 1 - cover->vN.c[ii]; - -#endif -} - -// -// -// - -static -void -skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color) -{ -#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r = 0; - color->aN.rgba[ii].g = 0; - color->aN.rgba[ii].b = 0; - color->aN.rgba[ii].a = 1; - } - -#else - // - // DISABLED ON GEN9 -- probably a compiler bug - // - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.even = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.even = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.odd = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.odd = 1; -#endif -} - -static -void -skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color) -{ -#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r = 0; - color->aN.rgba[ii].g = 0; - color->aN.rgba[ii].b = 0; - color->aN.rgba[ii].a = 1; - } - -#else - // - // DISABLED ON GEN9 -- probably a compiler bug - // - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.even = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.even = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].even.odd = 0; - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) - for (uint ii=0; iivN.rgba[ii].odd.odd = 1; -#endif -} - -// -// -// - -static -bool -skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color) -{ - // - // returns true if tile is opaque - // - // various 
hacks to test for complete tile opacity - // - // note that front-to-back currently has alpha at 0.0f -- this can - // be harmonized to use a traditional alpha if we want to support - // rendering in either direction - // - // hack -- ADD/MAX/OR all alphas together and test for non-zero - // - SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) - for (uint ii=1; iiaN.rgba[ii].a; - -#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) - // - // SIMD - // - return !any(t != ( 0 )); - -#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) - // - // SIMT - scalar per lane - // - return !sub_group_any(t != 0); - -#else - // - // SIMT - vector per lane - // - return !sub_group_any(any(t != ( 0 ))); - -#endif - - // - // TODO: The alternative vector-per-lane implementation below is - // *not* believed to be performant because the terse vector-wide - // test is just hiding a series of comparisons and is likely worse - // than the blind ADD/MAX/OR'ing of all alphas followed by a single - // test. - // -#if 0 - // - // SIMT - vector per lane - // - - // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1))) - for (uint ii=0; iivN.ba[ii].a != ( 0 )))) - return false; - } - - return true; -#endif -} - -// -// -// - -static -void -skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands, - uint * SKC_RESTRICT const cmd_next, - union skc_tile_color * SKC_RESTRICT const color) -{ - // - // acc.r = acc.a * r + acc.r - // acc.g = acc.a * g + acc.g - // acc.b = acc.a * b + acc.b - // - __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; - - *cmd_next += 2; - - SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g); - - SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b); -} - -// -// -// - -// #define SKC_SURFACE_IS_BUFFER -#ifdef SKC_SURFACE_IS_BUFFER - -static -void -skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface, - skc_uint const surface_pitch, - union skc_tile_color const * SKC_RESTRICT const color, - skc_ttck_hi_t const ttck_hi) -{ - // - // NEW MAJOR OPTIMIZATION: - // - // Rotating and rasterizing the original world transform by -90 - // degrees and then rendering the scene scene by +90 degrees enables - // all the final surface composite to be perfomed in perfectly - // coalesced wide transactions. - // - // For this reason, linear access to the framebuffer is preferred. - // - // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv - // - // NOTE THIS IS TRANSPOSED BY 90 DEGREES - // - // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE - // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. 
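
The opacity test and the BACKGROUND_OVER blend above both rely on the convention that color_acc.a tracks the tile's remaining transparency: it starts at 1.0, each front-to-back "over" burns it down, and once every alpha reads zero the tile can set the scatter-skip flag. The scalar model below restates that convention; layer colors are treated as premultiplied, which the over formula implies, and the code is illustrative rather than a copy of the kernel's vector paths.

/* front-to-back accumulator model: a == remaining transparency */
#include <stdio.h>

struct acc { float r, g, b, a; };

static void acc_init(struct acc * c) { c->r = c->g = c->b = 0.0f; c->a = 1.0f; }

/* "over", front to back: weight = cover * remaining transparency */
static void blend_over(struct acc * c, float cover, float r, float g, float b, float a)
{
  float const w = cover * c->a;   /* "fralunco" in the kernel above */

  c->r += w * r;
  c->g += w * g;
  c->b += w * b;
  c->a -= w * a;                  /* burn down remaining transparency */
}

/* whatever transparency is left is filled with the background color */
static void background_over(struct acc * c, float r, float g, float b)
{
  c->r += c->a * r;
  c->g += c->a * g;
  c->b += c->a * b;
}

int main(void)
{
  struct acc c;

  acc_init(&c);
  blend_over(&c, 1.0f, 0.5f, 0.0f, 0.0f, 0.5f); /* 50% red (premultiplied), full cover */
  blend_over(&c, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f); /* opaque blue underneath              */
  background_over(&c, 1.0f, 1.0f, 1.0f);        /* white background: no-op, a == 0     */

  printf("rgb = %.2f %.2f %.2f  remaining transparency = %.2f\n", c.r, c.g, c.b, c.a);
  return 0;
}
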
- // - // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS - // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS - // - // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL - // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER - // - uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE; - uint const x = skc_ttck_hi_get_x(ttck_hi); - uint const y = skc_ttck_hi_get_y(ttck_hi) ; - uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane(); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiaN.rgba[ii].r * 255); - rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8; - rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16; - - surface[base + ii * pitch] = rgba; - - // printf("%08v2X\n",rgba); - } -} - -#else - -static -void -skc_surface_composite_u8_rgba(__write_only image2d_t surface, - union skc_tile_color const * SKC_RESTRICT const color, - skc_ttck_hi_t const ttck_hi) -{ - // - // NEW MAJOR OPTIMIZATION: - // - // Rotating and rasterizing the original world transform by -90 - // degrees and then rendering the scene scene by +90 degrees enables - // all the final surface composite to be perfomed in perfectly - // coalesced wide transactions. - // - // For this reason, linear access to the framebuffer is preferred. - // - // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv - // - // NOTE THIS IS TRANSPOSED BY 90 DEGREES - // - // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE - // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. - // - // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS - // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS - // - // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL - // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER - // - -#if 1 - int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; - int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiiN.rgba[ii] A); \ - } - -#else - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_SURFACE_COLOR const rgba = \ - (SKC_RENDER_SURFACE_COLOR) \ - (color->aN.rgba[ii].r C, \ - color->aN.rgba[ii].g C, \ - color->aN.rgba[ii].b C, \ - 1.0); \ - SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \ - } - -#endif - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - - x += 1; - } -#else - int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); - int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; - - // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) - for (uint ii=0; iiiN.rgba[ii] A); \ - } - -#else - -#undef SKC_EXPAND_X -#define SKC_EXPAND_X(I,S,C,P,A) { \ - SKC_RENDER_SURFACE_COLOR const rgba = \ - (SKC_RENDER_SURFACE_COLOR) \ - (color->aN.rgba[ii].r C, \ - color->aN.rgba[ii].g C, \ - color->aN.rgba[ii].b C, \ - 1.0); \ - SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \ - } - -#endif - - SKC_RENDER_SCANLINE_VECTOR_EXPAND(); - } - -#endif -} - -#endif - -// -// -// -static -uint const -skc_ttck_lane(uint const ttck_idx) -{ - return ttck_idx & SKC_RENDER_SUBGROUP_MASK; -} - -// -// RENDER KERNEL -// - -__kernel -SKC_RENDER_KERNEL_ATTRIBS -void -skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers, - __global struct 
skc_group_node const * SKC_RESTRICT const groups, - __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename - - __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys - skc_uint const ttck_count, // rename: key_count - - __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets - skc_uint const tile_count, // rename: offset_count - - __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, -#ifdef SKC_SURFACE_IS_BUFFER - __global void * SKC_RESTRICT const surface, -#else - __write_only image2d_t surface, -#endif -#ifdef SKC_SURFACE_IS_BUFFER - skc_uint const surface_pitch, -#endif - uint4 const tile_clip) // rename: clip -{ - // - // Each subgroup is responsible for a tile. No extra subgroups are - // launched. - // - // FIXME -- might be better implemented as a "grid stride loop" if - // Intel GEN really has a local memory "quantum" of 4KB which means - // we would need to launch 4 subgroups per workgroup. - // - // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB. - // - - // - // declare tile cover and color registers - // - // this used to be a neat unified struct but the Intel GEN compiler - // wasn't cooperating and spilling to private memory even though all - // registers were indexed by constants - // - union skc_tile_color color_wip; - union skc_tile_color color_acc; - - union skc_tile_cover cover_wip; - union skc_tile_cover cover_acc; - union skc_tile_cover cover_msk; - - // - // which subgroup in the grid is this? - // - // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0) - // as a uniform but the alternative calculation used when there are - // multiple subgroups per workgroup is not cooperating and - // driving spillage elsewhere. - // -#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) - skc_uint const ttck_offset_idx = get_group_id(0); -#else - skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id(); -#endif - - // - // load the starting ttck for this offset and get a bound on the max - // number of keys that might be loaded - // - // these are uniform across all subgroup lanes - // - skc_uint ttck_idx = ttck_offsets[ttck_offset_idx]; - - // - // FIXME -- SIMD/CPU version should probaby load a 256-bit (4-wide) - // vector of ttck keys - // -#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - - skc_ttck_t ttck = ttck_keys[ttck_idx]; - -#else - - uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK; - uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK; - skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)] - -#endif - - // - // set up style group/layer state - // - struct skc_styling_group { - union skc_group_range range; - skc_uint depth; - skc_uint id; - } group; - - group.range.lo = 0; - group.range.hi = SKC_UINT_MAX; - group.depth = 0; - group.id = SKC_UINT_MAX; - - // - // start with clear tile opacity, knockout and flag bits - // - // uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 - // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 - // - skc_uint flags = 0; - - // - // declare and initialize accumulators - // -#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) - __local union skc_subgroup_smem smem[1]; -#else - __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS]; - __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id(); -#endif - -#ifdef 
SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - // - // select the initial ttck key - // - skc_ttck_t ttck; -#if 0 - ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN -#else - ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND - ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane); -#endif - -#endif - - // - // save the first key so we know what tile we're in - // - skc_ttck_t ttck0 = ttck; - - // - // evaluate the coarse clip as late as possible - // - skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi); - - if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x)) - return; - - skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi); - - if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y)) - return; - -#if 0 - printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y); -#endif - - // - // load -> scatter -> flush - // - while (true) - { - // if scattering is disabled then just run through ttck keys - bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0; - - // need to clear accumulators before a scatter loop - if (is_scatter_enabled) - { - skc_tile_aa_zero(smem); - } - - do { - // skip scattering? - if (is_scatter_enabled) - { - skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo); - - if (skc_ttck_lo_is_prefix(ttck.lo)) { - skc_scatter_ttpb(ttxb_extent,smem,xb_id); - } else { - skc_scatter_ttsb(ttxb_extent,smem,xb_id); - } - } - - // - // any ttck keys left? - // - if (++ttck_idx >= ttck_count) - { - flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; - break; - } - - // - // process next ttck key - // -#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK - // - // SIMD -- read next key - // - ttck = ttck_keys[ttck_idx]; -#else - // - // SIMT -- refresh the ttck_s? - // - uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK; - - if (ttck_lane_next == 0) - ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)]; - - // - // broadcast next key to entire subgroup - // -#if 0 - ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN -#else - ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND - ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next); -#endif -#endif - // continue scattering if on same YXL layer - } while (skc_ttck_equal_yxl(ttck0,ttck)); - - // finalize if no longer on same YX tile - if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi)) - { - // otherwise, unwind the tile styling and exit - flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; - } - - // - // given: new layer id from ttxk key - // - // load [layer id]{ group id, depth } - // - // if within current group's layer range - // - // if at same depth - // - // load and execute cover>[mask>]color>blend commands - // - // else if not at same depth then move deeper - // - // for all groups in group trail from cur depth to new depth - // enter group, saving and initializing regs as necessary - // increment depth and update layer range - // load and execute cover>[mask>]color>blend commands - // - // else not within layer range - // - // exit current group, restoring regs as necessary - // decrement depth and update layer range - // - // - skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi - union skc_layer_node const layer_node_new = layers[layer_id_new]; - - // clear flag that controls group/layer traversal - flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE; - - do { - bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0; - - // - // is 
layer a child of the current parent group? - // - uint cmd_next = 0; - - if (!unwind && (layer_node_new.parent == group.id)) - { - // execute this layer's cmds - cmd_next = layer_node_new.cmds; - - // if this is final then configure so groups get unwound, otherwise we're done - flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE); - } - else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi)) - { - // - // is layer in a child group? - // - union skc_group_parents const gp = groups[layer_node_new.parent].parents; - uint const gn = gp.depth - ++group.depth; - - if (gn == 0) - group.id = layer_node_new.parent; - else - group.id = commands[gp.base + gn - 1].parent; - - // update group layer range - group.range = groups[group.id].range; - - // enter current group - cmd_next = groups[group.id].cmds.enter; - } - else // otherwise, exit this group - { - // enter current group - cmd_next = groups[group.id].cmds.leave; - - // decrement group depth - if (--group.depth == 0) - { - flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE; - } - else - { - // get path_base of current group - uint const gnpb = groups[group.id].parents.base; - - // get parent of current group - group.id = commands[gnpb].parent; - - // update group layer range - group.range = groups[group.id].range; - } - } - - // - // execute cmds - // - while (true) - { - union skc_styling_cmd const cmd = commands[cmd_next++]; - - switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE) - { - case SKC_STYLING_OPCODE_NOOP: - break; - - case SKC_STYLING_OPCODE_COVER_NONZERO: - skc_tile_cover_nonzero(smem,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_EVENODD: - skc_tile_cover_evenodd(smem,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACCUMULATE: - skc_tile_cover_accumulate(&cover_acc,&cover_wip); - break; - - case SKC_STYLING_OPCODE_COVER_MASK: - skc_tile_cover_wip_mask(&cover_wip,&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_WIP_ZERO: - skc_tile_cover_wip_zero(&cover_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACC_ZERO: - skc_tile_cover_acc_zero(&cover_acc); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_ZERO: - skc_tile_cover_msk_zero(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_ONE: - skc_tile_cover_msk_one(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COVER_MASK_INVERT: - skc_tile_cover_msk_invert(&cover_msk); - break; - - case SKC_STYLING_OPCODE_COLOR_FILL_SOLID: - skc_tile_color_fill_solid(commands,&cmd_next,&color_wip); - break; - - case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR: - // - // FIXME -- gradients shouldn't be executing so much - // conditional driven code at runtime since we *know* - // the gradient style on the host can just create a - // new styling command to exploit this. - // - // FIXME -- it might be time to try using the GPU's - // sampler on a linear array of half4 vectors -- it - // might outperform the explicit load/lerp routines. - // - // FIXME -- optimizing for vertical gradients (uhhh, - // they're actually horizontal due to the -90 degree - // view transform) is nice but is it worthwhile to - // have this in the kernel? Easy to add it back... 
- // -#if defined( SKC_ARCH_GEN9 ) - // disable gradients due to exessive spillage -- fix later - cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32); -#else - skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi); -#endif - break; - - case SKC_STYLING_OPCODE_COLOR_WIP_ZERO: - skc_tile_color_wip_zero(&color_wip); - break; - - case SKC_STYLING_OPCODE_COLOR_ACC_ZERO: - skc_tile_color_acc_zero(&color_acc); - break; - - case SKC_STYLING_OPCODE_BLEND_OVER: - skc_tile_blend_over(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_PLUS: - skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_MULTIPLY: - skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_BLEND_KNOCKOUT: - skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip); - break; - - case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK: - // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip); - break; - - case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK: - // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc); - break; - - case SKC_STYLING_OPCODE_BACKGROUND_OVER: - skc_tile_background_over(commands,&cmd_next,&color_acc); - break; - - case SKC_STYLING_OPCODE_SURFACE_COMPOSITE: -#ifdef SKC_SURFACE_IS_BUFFER - skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi); -#else - skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi); -#endif - break; - - case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY: - if (skc_tile_color_test_opacity(&color_acc)) - flags |= SKC_TILE_FLAGS_SCATTER_SKIP; - break; - - default: - return; // this is an illegal opcode -- trap and die! - } - - // - // if sign bit is set then this was final command - // - if (cmd.s32 < 0) - break; - } - - // continue as long as tile flush isn't complete - } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0); - - // return if was the final flush - if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) - return; - - // update wip ttck_hi - ttck0 = ttck; - } -} - -// -// -// diff --git a/src/compute/skc/runtime_cl.c b/src/compute/skc/runtime_cl.c deleted file mode 100644 index a745ed013e..0000000000 --- a/src/compute/skc/runtime_cl.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include -#include -#include -#include - -// -// -// - -#include "runtime_cl.h" -#include "common/cl/assert_cl.h" - -// -// -// - -static is_verbose = true; - -// -// FIXME -- all variable length device queries need to start querying -// the parameter's return size before getting its value -// -// FIXME -- this is now handled by the common/cl/find.* routine -// - -union skc_cl_device_version { - struct { - cl_uchar opencl_space[7]; // "OpenCL_" - cl_uchar major; - cl_uchar dot; - cl_uchar minor; -#if 1 // Intel NEO requires at least 16 bytes - cl_uchar space; - cl_uchar vendor[32]; -#endif - }; - struct { - cl_uchar aN[]; - }; -}; - -typedef cl_bitfield cl_diagnostic_verbose_level_intel; - -#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4 - -static -void -CL_CALLBACK -skc_context_callback(char const * error, void const * info, size_t size, void * user) -{ - if (info != NULL ) - { - fprintf(stderr,"%s\n",error); - } -} - -// -// -// - -skc_err -skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]) -{ - skc_err err = SKC_ERR_SUCCESS; - - // - // search available devices for a match - // -#define PLATFORM_IDS_MAX 16 -#define DEVICE_IDS_MAX 16 -#define PLATFORM_NAME_SIZE_MAX 64 -#define DEVICE_NAME_SIZE_MAX 64 -#define DRIVER_VERSION_SIZE_MAX 64 - - cl_int cl_err; - - cl_platform_id platform_ids[PLATFORM_IDS_MAX]; - cl_device_id device_ids [PLATFORM_IDS_MAX][DEVICE_IDS_MAX]; - - cl_uint platform_count; - cl_uint device_count[PLATFORM_IDS_MAX]; - - cl_uint platform_idx = UINT32_MAX, device_idx = UINT32_MAX; - - bool match = false; // find _first_ match - - // - // get number of platforms - // - cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count)); - - // - // search platforms - // - for (cl_uint ii=0; iiversion.major = device_version.major - 48; - runtime_cl->version.minor = device_version.minor - 48; - runtime_cl->base_align = base_align; - - if (is_verbose) { - fprintf(stdout," >>>"); - } - } - else if (is_verbose) - { - fprintf(stdout," "); - } - - if (is_verbose) { - fprintf(stdout, - " %1u: %s [ %s ] [ %s ] [ %u ]\n", - jj, - device_name, - device_version.aN, - driver_version, - base_align); - } - } - } - - if (is_verbose) { - fprintf(stdout,"\n"); - } - - // - // get target platform and device - // - if (platform_idx >= platform_count) - { - fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring); - exit(EXIT_FAILURE); - } - if (device_idx >= device_count[platform_idx]) - { - fprintf(stderr,"no match for target device substring %s\n",target_device_substring); - exit(EXIT_FAILURE); - } - - runtime_cl->platform_id = platform_ids[platform_idx]; - runtime_cl->device_id = device_ids [platform_idx][device_idx]; - - // - // create context - // - -#if 0 - cl_context_properties context_properties[] = - { - CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id, - 0 - }; -#else - context_properties[1] = (cl_context_properties)runtime_cl->platform_id; -#endif - - runtime_cl->context = clCreateContext(context_properties, - 1, - &runtime_cl->device_id, - skc_context_callback, - NULL, - &cl_err); - cl_ok(cl_err); - - // - // get device name, driver version, and unified memory flag - // - if (is_verbose) - { - char 
device_name[DEVICE_NAME_SIZE_MAX]; - char driver_version[DRIVER_VERSION_SIZE_MAX]; - cl_bool device_is_unified; - cl_device_svm_capabilities svm_caps; - size_t printf_buffer_size; - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_NAME, - sizeof(device_name), - device_name, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DRIVER_VERSION, - sizeof(driver_version), - driver_version, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_HOST_UNIFIED_MEMORY, - sizeof(device_is_unified), - &device_is_unified, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_SVM_CAPABILITIES, - sizeof(svm_caps), - &svm_caps, - 0)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_PRINTF_BUFFER_SIZE, - sizeof(printf_buffer_size), - &printf_buffer_size, - NULL)); - - fprintf(stderr, - "CL_DEVICE_SVM_COARSE_GRAIN_BUFFER %c\n" - "CL_DEVICE_SVM_FINE_GRAIN_BUFFER %c\n" - "CL_DEVICE_SVM_FINE_GRAIN_SYSTEM %c\n" - "CL_DEVICE_SVM_ATOMICS %c\n" - "CL_DEVICE_PRINTF_BUFFER_SIZE %zu\n\n", - svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-', - svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? '*' : '-', - svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? '*' : '-', - svm_caps & CL_DEVICE_SVM_ATOMICS ? '*' : '-', - printf_buffer_size); - } - - return err; -} - -// -// -// - -skc_err -skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl) -{ - // FIXME - printf("%s incomplete!\n",__func__); - - return SKC_ERR_SUCCESS; -} - -// -// -// - -cl_command_queue -skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type) -{ - cl_command_queue cq; - - if (runtime_cl->version.major < 2) - { - // - // <= OpenCL 1.2 - // - cl_int cl_err; - - cq = clCreateCommandQueue(runtime_cl->context, - runtime_cl->device_id, - (cl_command_queue_properties)type, - &cl_err); cl_ok(cl_err); - } - else - { - // - // >= OpenCL 2.0 - // - cl_int cl_err; - cl_queue_properties const queue_properties[] = { - CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0 - }; - - cq = clCreateCommandQueueWithProperties(runtime_cl->context, - runtime_cl->device_id, - queue_properties, - &cl_err); cl_ok(cl_err); - } - - return cq; -} - -// -// -// - diff --git a/src/compute/skc/runtime_cl.h b/src/compute/skc/runtime_cl.h deleted file mode 100644 index 9e58ca0cc7..0000000000 --- a/src/compute/skc/runtime_cl.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#pragma once - -// -// squelch OpenCL 1.2 deprecation warning -// - -#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#endif - -#include - -// -// -// - -#include "skc.h" - -// -// Minimal OpenCL state needed by the runtime to get started -// - -struct skc_runtime_cl -{ - cl_platform_id platform_id; - cl_device_id device_id; - cl_context context; - - struct { - cl_uint major; - cl_uint minor; - } version; // sometimes we need to know this at runtime - - cl_uint base_align; // base address alignment for subbuffer origins -}; - -// -// -// - -typedef enum skc_cq_type_e { - SKC_CQ_TYPE_IN_ORDER = 0, - SKC_CQ_TYPE_OUT_OF_ORDER = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, - SKC_CQ_TYPE_IN_ORDER_PROFILING = (SKC_CQ_TYPE_IN_ORDER | CL_QUEUE_PROFILING_ENABLE), - SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE), -} skc_cq_type_e; - -// -// safely creates a generic OpenCL target in very few lines -// - -skc_err -skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]); - -skc_err -skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl); - -// -// create a command queue with the non-deprecated function -// - -cl_command_queue -skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type); - -// -// -// - diff --git a/src/compute/skc/runtime_cl_12.c b/src/compute/skc/runtime_cl_12.c deleted file mode 100644 index fca13edbbd..0000000000 --- a/src/compute/skc/runtime_cl_12.c +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include -#include -#include -#include - -// -// -// - -#include "context.h" -#include "block.h" -#include "grid.h" -#include "common/cl/assert_cl.h" -#include "config_cl.h" -#include "runtime_cl.h" -#include "runtime_cl_12.h" -#include "export_cl_12.h" - -// -// -// - -static -void -skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq) -{ - // save size - runtime->block_pool.size = &runtime->config->block_pool; - - // create block extent - skc_extent_pdrw_alloc(runtime, - &runtime->block_pool.blocks, - runtime->block_pool.size->pool_size * - runtime->config->block.bytes); - - // allocate block pool ids - skc_extent_pdrw_alloc(runtime, - &runtime->block_pool.ids, - runtime->block_pool.size->ring_pow2 * sizeof(skc_uint)); - - // allocate block pool atomics - skc_extent_phr_pdrw_alloc(runtime, - &runtime->block_pool.atomics, - sizeof(union skc_block_pool_atomic)); - - // acquire pool id and atomic initialization kernels - cl_kernel k0 = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS); - cl_kernel k1 = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS); - - // init ids - cl(SetKernelArg(k0,0,sizeof(runtime->block_pool.ids.drw),&runtime->block_pool.ids.drw)); - cl(SetKernelArg(k0,1,SKC_CL_ARG(runtime->block_pool.size->pool_size))); - - // the kernel grid is shaped by the target device -- always 2 for atomics - skc_device_enqueue_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS, - cq,k0,runtime->block_pool.size->pool_size, - 0,NULL,NULL); - - // init atomics - cl(SetKernelArg(k1,0,sizeof(runtime->block_pool.atomics.drw),&runtime->block_pool.atomics.drw)); - cl(SetKernelArg(k1,1,SKC_CL_ARG(runtime->block_pool.size->pool_size))); - - // the kernel grid is shaped by the target device - skc_device_enqueue_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS, - cq,k1,2, - 0,NULL,NULL); - - // kickstart kernel execution - cl(Flush(cq)); - - // release kernels - cl(ReleaseKernel(k0)); - cl(ReleaseKernel(k1)); -} - -static -void -skc_block_pool_dispose(struct skc_runtime * const runtime) -{ - skc_extent_phr_pdrw_free(runtime,&runtime->block_pool.atomics); - skc_extent_pdrw_free (runtime,&runtime->block_pool.ids); - skc_extent_pdrw_free (runtime,&runtime->block_pool.blocks); -} - -// -// -// - -static -bool -skc_runtime_yield(struct skc_runtime * const runtime) -{ - return skc_scheduler_yield(runtime->scheduler); -} - -static -void -skc_runtime_wait(struct skc_runtime * const runtime) -{ - skc_scheduler_wait(runtime->scheduler); -} - -// -// -// - -skc_err -skc_runtime_cl_12_create(struct skc_context * const context, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]) -{ - // allocate the runtime - struct skc_runtime * const runtime = malloc(sizeof(*runtime)); - - // acquire OpenCL ids and context for target device - skc_err err = skc_runtime_cl_create(&runtime->cl, - target_platform_substring, - target_device_substring, - context_properties); - - // create device - skc_device_create(runtime); - - // create the host and device allocators - skc_allocator_host_create(runtime); - skc_allocator_device_create(runtime); - - // how many slots in the scheduler? 
- runtime->scheduler = skc_scheduler_create(runtime,runtime->config->scheduler.size); - - // allocate deps structure - runtime->deps = skc_grid_deps_create(runtime, - runtime->scheduler, - runtime->config->block_pool.pool_size); - - // initialize cq pool - skc_cq_pool_create(runtime, - &runtime->cq_pool, - runtime->config->cq_pool.type, - runtime->config->cq_pool.size); - - // acquire in-order cq - cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); - - // initialize block pool - skc_block_pool_create(runtime,cq); - - // intialize handle pool - skc_handle_pool_create(runtime, - &runtime->handle_pool, - runtime->config->handle_pool.size, - runtime->config->handle_pool.width, - runtime->config->handle_pool.recs); - - // - // initialize pfns - // - // FIXME -- at this point we will have identified which device we've - // targeted and will load a DLL (or select from a built-in library) - // that contains all the pfns. - // - context->runtime = runtime; - - context->yield = skc_runtime_yield; - context->wait = skc_runtime_wait; - - context->path_builder = skc_path_builder_cl_12_create; - context->path_retain = skc_runtime_path_host_retain; - context->path_release = skc_runtime_path_host_release; - context->path_flush = skc_runtime_path_host_flush; - - context->raster_builder = skc_raster_builder_cl_12_create; - context->raster_retain = skc_runtime_raster_host_retain; - context->raster_release = skc_runtime_raster_host_release; - context->raster_flush = skc_runtime_raster_host_flush; - - context->composition = skc_composition_cl_12_create; - context->styling = skc_styling_cl_12_create; - - context->surface = skc_surface_cl_12_create; - - // block on pool creation - cl(Finish(cq)); - - // dispose of in-order cq - skc_runtime_release_cq_in_order(runtime,cq); - - return err; -}; - -// -// -// - -skc_err -skc_runtime_cl_12_dispose(struct skc_context * const context) -{ - // - // FIXME -- incomplete - // - fprintf(stderr,"%s incomplete!\n",__func__); - - struct skc_runtime * runtime = context->runtime; - - skc_allocator_device_dispose(runtime); - skc_allocator_host_dispose(runtime); - - skc_scheduler_dispose(context->runtime,context->runtime->scheduler); - - skc_grid_deps_dispose(context->runtime->deps); - - skc_cq_pool_dispose(runtime,&runtime->cq_pool); - - skc_block_pool_dispose(context->runtime); - - // skc_handle_pool_dispose(context->runtime); - - return SKC_ERR_SUCCESS; -} - -// -// TEMPORARY BENCHMARK -// - -#if 1 - -#include - -#define SKC_FRAMES_MASK 0x7F -#define SKC_FRAMES (SKC_FRAMES_MASK + 1) - -void -skc_runtime_cl_12_debug(struct skc_context * const context) -{ -#ifdef NDEBUG - static skc_uint frames=0; - static LARGE_INTEGER StartingTime={0}, EndingTime; - - if ((frames++ & SKC_FRAMES_MASK) != SKC_FRAMES_MASK) - return; - - QueryPerformanceCounter(&EndingTime); - - LARGE_INTEGER ElapsedMicroseconds, Frequency; - - ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart; - - QueryPerformanceFrequency(&Frequency); - - double const msecs_total = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart; - double const msecs_frame = msecs_total / SKC_FRAMES; - - printf("Frames / Total / Per : %u / %.3f / %.3f\n", - SKC_FRAMES,msecs_total,msecs_frame); -#endif - - struct skc_runtime * const runtime = context->runtime; - - // acquire out-of-order cq - cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); - - // copy atomics to host - skc_extent_phr_pdrw_read(&runtime->block_pool.atomics,cq,NULL); - - // block until complete - cl(Finish(cq)); - - // 
dispose of out-of-order cq - skc_runtime_release_cq_in_order(runtime,cq); - - union skc_block_pool_atomic const * const bp_atomic = runtime->block_pool.atomics.hr; - - skc_uint const available = bp_atomic->writes - bp_atomic->reads; - skc_uint const inuse = runtime->config->block_pool.pool_size - available; - - fprintf(stderr,"w/r/f/a: %9u - %9u = %9u : %6.2f MB\n", - bp_atomic->writes, - bp_atomic->reads, - available, - (inuse * runtime->config->block.bytes) / (1024.0*1024.0)); - - if (available >= (1<<27)) - { - fprintf(stderr,"block pool corrupted!\n"); - exit(-1); - } - - // - // - // -#ifdef NDEBUG - QueryPerformanceCounter(&StartingTime); -#endif -} - -#endif - -// -// -// - diff --git a/src/compute/skc/runtime_cl_12.h b/src/compute/skc/runtime_cl_12.h deleted file mode 100644 index 7e7ffcb284..0000000000 --- a/src/compute/skc/runtime_cl_12.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include "runtime.h" -#include "runtime_cl.h" -#include "cq_pool_cl.h" -#include "handle_pool_cl_12.h" -#include "block_pool_cl_12.h" -#include "allocator_device_cl.h" - -// -// FIXME -- two parts: -// -// 1. directly access the structures in the runtime sub-struct implementations -// 2. possibly wall off the non-platform-specific structs into a sub structure -// - -struct skc_runtime -{ - // - // state visible to device - // - struct skc_runtime_cl cl; - - struct { - struct skc_allocator_host host; - struct skc_allocator_device device; - } allocator; - - struct skc_cq_pool cq_pool; - - struct skc_block_pool block_pool; - - struct skc_handle_pool handle_pool; - - // - // state that is slightly opaque (for now) - // - struct skc_scheduler * scheduler; - - struct skc_grid_deps * deps; - - struct skc_config const * config; // FIXME: config will be determined by device with some opportunities to resize - - struct skc_device * device; // opaque bundle of kernels -}; - -// -// Creation and disposal intitializes context and may rely on other -// context resources like the scheduler -// - -skc_err -skc_runtime_cl_12_create(struct skc_context * const context, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]); - -skc_err -skc_runtime_cl_12_dispose(struct skc_context * const context); - -// -// HOST HANDLE RETAIN/RELEASE/FLUSH -// - -skc_err -skc_runtime_path_host_retain(struct skc_runtime * const runtime, - skc_path_t const * paths, - uint32_t count); - -skc_err -skc_runtime_raster_host_retain(struct skc_runtime * const runtime, - skc_raster_t const * rasters, - uint32_t count); - - -skc_err -skc_runtime_path_host_release(struct skc_runtime * const runtime, - skc_path_t const * paths, - uint32_t count); - -skc_err -skc_runtime_raster_host_release(struct skc_runtime * const runtime, - skc_raster_t const * rasters, - uint32_t count); - - -skc_err -skc_runtime_path_host_flush(struct skc_runtime * const runtime, - skc_path_t const * paths, - uint32_t count); - -skc_err -skc_runtime_raster_host_flush(struct skc_runtime * const runtime, - skc_raster_t const * rasters, - uint32_t count); - -// -// DEVICE/PIPELINE HANDLE ACQUIRE/RETAIN/RELEASE -// -// The retain operations pre-validate handles -// - -skc_handle_t -skc_runtime_handle_device_acquire(struct skc_runtime * const runtime); - -skc_err -skc_runtime_handle_device_validate_retain(struct skc_runtime * 
const runtime, - skc_typed_handle_type_e const handle_type, - skc_typed_handle_t const * typed_handles, - uint32_t count); - -void -skc_runtime_handle_device_retain(struct skc_runtime * const runtime, - skc_handle_t const * handles, - uint32_t count); - -void -skc_runtime_path_device_release(struct skc_runtime * const runtime, - skc_handle_t const * handles, - uint32_t count); - -void -skc_runtime_raster_device_release(struct skc_runtime * const runtime, - skc_handle_t const * handles, - uint32_t count); - -// -// We only use in-order command queues in the pipeline -// - -cl_command_queue -skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime); - -void -skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, - cl_command_queue cq); - -// -// DEVICE MEMORY ALLOCATION -// - -cl_mem -skc_runtime_device_perm_alloc(struct skc_runtime * const runtime, - cl_mem_flags const flags, - size_t const size); - -void -skc_runtime_device_perm_free(struct skc_runtime * const runtime, - cl_mem const mem); - -cl_mem -skc_runtime_device_temp_alloc(struct skc_runtime * const runtime, - cl_mem_flags const flags, - size_t const size, - skc_subbuf_id_t * const subbuf_id, - size_t * const subbuf_size); - -void -skc_runtime_device_temp_free(struct skc_runtime * const runtime, - cl_mem const mem, - skc_subbuf_id_t const subbuf_id); - -// -// -// diff --git a/src/compute/skc/segment_ttck.cl b/src/compute/skc/segment_ttck.cl deleted file mode 100644 index 6ac068bee6..0000000000 --- a/src/compute/skc/segment_ttck.cl +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE -// LAYOUT OF THE TTCK KEY. 
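
The kernel that follows walks each sorted slab of TTCK keys, flags the positions where the tile's y/x field changes, and compacts those positions into an index array with a subgroup scan plus a single atomic reservation. The standalone sketch below restates that segmentation step in scalar form; the mask value is a placeholder for the real TTCK layout constant, and the padding-key (SKC_ULONG_MAX) handling is omitted.

/* scalar model of TTCK segmentation: one index per run of keys with equal y/x */
#include <stdint.h>
#include <stdio.h>

#define TTCK_HI_MASK_YX 0xFFFFF000u    /* placeholder for SKC_TTCK_HI_MASK_YX */

static uint32_t hi32(uint64_t k) { return (uint32_t)(k >> 32); }

/* writes the start index of every y/x run into indices[], returns the count */
static uint32_t segment_ttck(uint64_t const * keys, uint32_t n, uint32_t * indices)
{
  uint32_t count = 0;

  for (uint32_t ii = 0; ii < n; ii++)
    {
      int const is_first = (ii == 0);
      int const yx_diff  = !is_first &&
        (((hi32(keys[ii]) ^ hi32(keys[ii - 1])) & TTCK_HI_MASK_YX) != 0);

      if (is_first || yx_diff)
        indices[count++] = ii;         /* the kernel reserves these slots atomically */
    }

  return count;
}

int main(void)
{
  uint64_t const keys[] = {            /* fake sorted keys, tile id in the high bits */
    0x0000100000000001ull, 0x0000100000000002ull,
    0x0000200000000001ull, 0x0000300000000007ull,
  };
  uint32_t idx[4];
  uint32_t const starts = segment_ttck(keys, 4, idx);

  for (uint32_t ii = 0; ii < starts; ii++)
    printf("run %u starts at key %u\n", ii, idx[ii]);

  return 0;
}
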
IF THE TTCK KEY IS ALTERED THEN THIS -// KERNEL WILL NEED TO BE UPDATED -// - -#include - -#include "atomic_cl.h" -#include "tile.h" - -// -// -// - -#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) -#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) - -// -// -// - -#define SKC_YX_NEQ(row,prev) \ - (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0) - -// -// -// - -__kernel -__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) -void -skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, - __global uint * SKC_RESTRICT const indices, - __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) -{ - uint const global_id = get_global_id(0); - uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; - uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); - uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; - - // - // LOAD ALL THE ROWS - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; - - HS_SLAB_ROWS(); - - // - // LOAD LAST REGISTER FROM COLUMN TO LEFT - // - uint diffs = 0; - uint2 r0 = r1; - - if (gmem_base > 0) { - // if this is the first key in any slab but the first then it - // broadcast loads the last key in previous slab - r0.hi = as_uint2(vout[gmem_base - 1]).hi; - } else if (get_sub_group_local_id() == 0) { - // if this is the first lane in the first slab - diffs = 1; - } - - // now shuffle in the last key from the column to the left - r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); - - // - // FIND ALL DIFFERENCES IN SLAB - // - uint valid = 0; - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - valid |= ((r##row != SKC_ULONG_MAX) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - diffs |= (SKC_YX_NEQ(row,prev) << prev); - - HS_SLAB_ROWS(); - - // - // SUM UP THE DIFFERENCES - // - uint const valid_diffs = valid & diffs; - uint const count = popcount(valid_diffs); - uint const inclusive = sub_group_scan_inclusive_add(count); - uint const exclusive = inclusive - count; - - // - // RESERVE SPACE IN THE INDICES ARRAY - // - uint next = 0; - - if (get_sub_group_local_id() == HS_LANES_PER_WARP-1) - next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset - - // distribute base across subgroup - next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1); - - // - // STORE THE INDICES - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (valid_diffs & (1 << prev)) \ - indices[next++] = lane_idx + prev; - - HS_SLAB_ROWS(); - - // - // TRANSPOSE THE SLAB AND STORE IT - // - HS_TRANSPOSE_SLAB(); -} - -// -// -// diff --git a/src/compute/skc/segment_ttrk.cl b/src/compute/skc/segment_ttrk.cl deleted file mode 100644 index 28a9557ad7..0000000000 --- a/src/compute/skc/segment_ttrk.cl +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright 2018 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE -// LAYOUT OF THE TTRK KEY. 
IF THE TTRK KEY IS ALTERED THEN THIS -// KERNEL WILL NEED TO BE UPDATED -// - -#include - -#include "tile.h" -#include "raster_builder_cl_12.h" // need meta_in structure -#include "device_cl_12_gen9.h" - -// -// -// - -#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) -#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) - -// -// THE BEST TYPE TO ZERO SMEM -// - -#define SKC_ZERO_TYPE ulong -#define SKC_ZERO_WORDS 2 - -// -// THE ORDER OF COMPONENTS IS: -// -// 0: blocks -// 1: offset -// 2: pk -// 3: rk -// - -#if (HS_KEYS_PER_SLAB < 256) - -#define SKC_META_TYPE uint -#define SKC_META_WORDS 1 - -#define SKC_COMPONENT_TYPE uchar - -#else - -#define SKC_META_TYPE uint2 -#define SKC_META_WORDS 2 - -#define SKC_COMPONENT_TYPE ushort - -#endif - -// -// -// - -#if ( SKC_TTRK_HI_BITS_COHORT <= 8) -#define SKC_COHORT_TYPE uchar -#else -#define SKC_COHORT_TYPE ushort -#endif - -// -// -// - -#define SKC_COHORT_ID(row) \ - as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT - -// -// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED -// - -#define SKC_IS_BLOCK(row) \ - ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) - -#define SKC_YX(row,prev) \ - (as_uint2(r##row).hi ^ as_uint2(r##prev).hi) - -#define SKC_IS_PK(row,prev) \ - ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X) - -// -// COHORT SIZE IS ALWAYS A POWER-OF-TWO -// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO -// -// COHORT SIZE >= SUBGROUP SIZE -// - -#define SKC_COHORT_SIZE (1<> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; - uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); - uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; - - // - // LOAD ALL THE ROWS - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; - - HS_SLAB_ROWS(); - - // - // LOAD LAST REGISTER FROM COLUMN TO LEFT - // - uint diffs = 0; - uint2 r0 = 0; - - if (gmem_base > 0) { - // if this is the first key in any slab but the first then it - // broadcast loads the last key in previous slab - r0.hi = as_uint2(vout[gmem_base - 1]).hi; - } else { - // otherwise broadcast the first key in the first slab - r0.hi = sub_group_broadcast(as_uint2(r1).hi,0); - // and mark it as an implicit diff - if (get_sub_group_local_id() == 0) - diffs = 1; - } - - // now shuffle in the last key from the column to the left - r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); - - // shift away y/x - SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT; - - // - // EXTRACT ALL COHORT IDS EARLY... - // -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row); - - HS_SLAB_ROWS(); - - // - // DEBUG - // -#if 0 - if (gmem_base == HS_KEYS_PER_SLAB * 7) - { - if (get_sub_group_local_id() == 0) - printf("\n%llX ",as_ulong(r0)); - else - printf("%llX ",as_ulong(r0)); -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (get_sub_group_local_id() == 0) \ - printf("\n%llX ",r##row); \ - else \ - printf("%llX ",r##row); - - HS_SLAB_ROWS(); - } -#endif - - // - // CAPTURE ALL CONDITIONS WE CARE ABOUT - // - // Diffs must be captured before cohorts - // - uint valid = 0; - uint blocks = 0; - uint pks = 0; - SKC_COHORT_TYPE c_max = 0; - - // - // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. 
AN - // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE - // -#if 0 - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - diffs |= ((c##row != c##prev) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - blocks |= (SKC_IS_BLOCK(row) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - pks |= SKC_IS_PK(row,prev) << prev); - - HS_SLAB_ROWS(); - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - valid |= ((r##row != SKC_ULONG_MAX) << prev); - - HS_SLAB_ROWS(); - -#else - -#undef HS_SLAB_ROW -#define HS_SLAB_ROW(row,prev) \ - if (c##row != c##prev) \ - diffs |= 1<>HS_LANES_PER_WARP_LOG2,c_min,c_max); -#endif - - // - // ZERO SMEM - // - // zero only the meta info for the cohort ids found in this slab - // -#if (SKC_ZERO_WORDS >= SKC_META_WORDS) - uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id(); - uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO; - - for (; zz<=zz_max; zz+=HS_LANES_PER_WARP) - shared.z[zz] = 0; -#else - // ERROR -- it's highly unlikely that the zero type is smaller than - // the meta type -#error("Unsupported right now...") -#endif - - // - // ACCUMULATE AND STORE META INFO - // - uint const valid_blocks = valid & blocks; - uint const valid_pks = valid & pks & ~diffs; - SKC_META_TYPE meta = ( 0 ); - -#define SKC_META_LOCAL_ADD(meta) \ - atomic_add(shared.m+HS_REG_LAST(c),meta); - -#define SKC_META_LOCAL_STORE(meta,prev) \ - shared.m[c##prev] = meta; - - // note this is purposefully off by +1 -#define SKC_META_RESET(meta,curr) \ - meta = ((gmem_off + curr) << 8); - -#if 0 - - // FIXME -- this can be tweaked to shift directly -#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ - meta += ((((blocks >> prev) & 1) ) | \ - (((pks >> prev) & 1) << 16) | \ - (((rks >> prev) & 1) << 24)); - -#else - -#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ - if (blocks & (1<= cc_min) && (cc <= cc_max)) - { - uint const c = shared.c[cc]; - - if (c != 0) - atomic_add(metas+cc,c+adjust); - } - - cc += HS_LANES_PER_WARP; - - for (; cc<=cc_max; cc+=HS_LANES_PER_WARP) - { - uint const c = shared.c[cc]; - - if (c != 0) - atomic_add(metas+cc,c+adjust); - } -} - -// -// -// diff --git a/src/compute/skc/styling_cl_12.c b/src/compute/skc/styling_cl_12.c deleted file mode 100644 index 6c84fe6f70..0000000000 --- a/src/compute/skc/styling_cl_12.c +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// NOTES: -// -// - this particular object only needs a command queue for a short -// time so consider acquiring/releasing the command queue on demand -// but only if command queues are cached and expensive to keep -// - -#include "common/cl/assert_cl.h" - -#include "styling_cl_12.h" -#include "extent_cl_12.h" -#include "runtime_cl_12.h" - -#include "context.h" -#include "styling_types.h" - -// -// -// - -static -void -skc_styling_unmap_complete(skc_grid_t const grid) -{ - struct skc_styling_impl * const impl = skc_grid_get_data(grid); - - impl->state = SKC_STYLING_STATE_SEALED; - - skc_grid_complete(grid); -} - -static -void -skc_styling_unmap_cb(cl_event event, cl_int status, skc_grid_t const grid) -{ - SKC_CL_CB(status); - - struct skc_styling_impl * const impl = skc_grid_get_data(grid); - struct skc_scheduler * const scheduler = impl->runtime->scheduler; - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(scheduler,skc_styling_unmap_complete,grid); -} - -static -void -skc_styling_grid_pfn_execute(skc_grid_t const grid) -{ - struct skc_styling_impl * const impl = skc_grid_get_data(grid); - struct skc_styling * const styling = impl->styling; - - // - // unmap all extents - // - cl_event complete; - - skc_extent_phwN_pdrN_unmap(&impl->layers,styling->layers.extent,impl->cq,NULL); - skc_extent_phwN_pdrN_unmap(&impl->groups,styling->groups.extent,impl->cq,NULL); - skc_extent_phwN_pdrN_unmap(&impl->extras,styling->extras.extent,impl->cq,&complete); - - // set the event - cl(SetEventCallback(complete,CL_COMPLETE,skc_styling_unmap_cb,grid)); - cl(ReleaseEvent(complete)); - - // flush command queue - cl(Flush(impl->cq)); -} - -// -// -// - -static -void -skc_styling_pfn_seal(struct skc_styling_impl * const impl) -{ - // return if sealing or sealed - if (impl->state >= SKC_STYLING_STATE_SEALING) - return; - - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // - // otherwise, wait for UNSEALING > UNSEALED transition - // - if (impl->state == SKC_STYLING_STATE_UNSEALING) - { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); - } - - // - // we're unsealed so we need to seal and start the grid - // - impl->state = SKC_STYLING_STATE_SEALING; - impl->grid = SKC_GRID_DEPS_ATTACH(runtime->deps, - NULL, - impl, - NULL, // no waiting - skc_styling_grid_pfn_execute, - NULL); // no dispose - - // no need to force -- styling has no dependencies - skc_grid_start(impl->grid); -} - -// -// -// - -void -skc_styling_unseal_complete(struct skc_styling_impl * const impl) -{ - struct skc_runtime * const runtime = impl->runtime; - - // we're now unsealed - impl->state = SKC_STYLING_STATE_UNSEALED; -} - -static -void -skc_styling_unseal_cb(cl_event event, cl_int status, struct skc_styling_impl * const impl) -{ - SKC_CL_CB(status); - - // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(impl->runtime->scheduler,skc_styling_unseal_complete,impl); -} - -static -void -skc_styling_pfn_unseal(struct skc_styling_impl * const impl, skc_bool const block) -{ - // return if already unsealed - if (impl->state == SKC_STYLING_STATE_UNSEALED) - return; - - // - // otherwise, we're going to need to pump the scheduler - // - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // - // wait for UNSEALING > UNSEALED transition - // - 
if (impl->state == SKC_STYLING_STATE_UNSEALING) - { - if (block) { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); - } - return; - } - - // - // otherwise, wait for SEALING > SEALED transition ... - // - if (impl->state == SKC_STYLING_STATE_SEALING) - { - // wait if sealing - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_SEALED); - } - - // wait for rendering locks to be released - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0); - - // ... and then unseal the styling object - impl->state = SKC_STYLING_STATE_UNSEALING; - - // defensively NULL the grid reference - impl->grid = NULL; // defensive - - // set styling pointers with mapped extents - cl_event complete; - - struct skc_styling * const styling = impl->styling; - - styling->layers.extent = skc_extent_phwN_pdrN_map(&impl->layers,impl->cq,NULL); - styling->groups.extent = skc_extent_phwN_pdrN_map(&impl->groups,impl->cq,NULL); - styling->extras.extent = skc_extent_phwN_pdrN_map(&impl->extras,impl->cq,&complete); - - cl(SetEventCallback(complete,CL_COMPLETE,skc_styling_unseal_cb,impl)); - cl(ReleaseEvent(complete)); - - // flush it - cl(Flush(impl->cq)); - - // wait until unsealed... - if (block) { - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_UNSEALED); - } -} - -// -// -// - -static -void -skc_styling_pfn_release(struct skc_styling_impl * const impl) -{ - if (--impl->styling->ref_count != 0) - return; - - // - // otherwise, unmap all resources by sealing and delete - // - skc_styling_pfn_seal(impl); - - struct skc_runtime * const runtime = impl->runtime; - struct skc_scheduler * const scheduler = runtime->scheduler; - - // wait until sealed - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->state != SKC_STYLING_STATE_SEALED); - - // wait for locks to drain - SKC_SCHEDULER_WAIT_WHILE(scheduler,impl->lock_count > 0) - - // - // styling is now disposable - // - - // free styling host - skc_runtime_host_perm_free(runtime,impl->styling); - - // release the cq - skc_runtime_release_cq_in_order(runtime,impl->cq); - - // free extents - skc_extent_phwN_pdrN_free(runtime,&impl->layers); - skc_extent_phwN_pdrN_free(runtime,&impl->groups); - skc_extent_phwN_pdrN_free(runtime,&impl->extras); - - // free styling impl - skc_runtime_host_perm_free(runtime,impl); -} - -// -// -// - -void -skc_styling_retain_and_lock(struct skc_styling * const styling) -{ - skc_styling_retain(styling); - - styling->impl->lock_count += 1; -} - -void -skc_styling_unlock_and_release(struct skc_styling * const styling) -{ - styling->impl->lock_count -= 1; - - skc_styling_pfn_release(styling->impl); -} - -// -// -// - -skc_err -skc_styling_cl_12_create(struct skc_context * const context, - struct skc_styling * * const styling, - skc_uint const layers_count, - skc_uint const groups_count, - skc_uint const extras_count) -{ - // retain the context - // skc_context_retain(context); - - // allocate the impl - struct skc_runtime * const runtime = context->runtime; - struct skc_styling_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - // allocate styling - (*styling) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**styling)); - (*styling)->context = context; - (*styling)->impl = impl; - - // intialize impl - impl->styling = (*styling); - impl->runtime = runtime; - - SKC_ASSERT_STATE_INIT(impl,SKC_STYLING_STATE_SEALED); - - impl->lock_count = 0; - - impl->cq = skc_runtime_acquire_cq_in_order(runtime); - - // - // The styling object 
is unique in that the API lets the user - // specify resource limits - // - // The styling object is a simple container that can have wildly - // varying resource requirements (but still relatively modest). - // - // Additionally, an advanced SKC programmer may want to create many - // styling and composition objects as they're relatively cheap. - // - skc_extent_phwN_pdrN_alloc(runtime,&impl->layers,sizeof(*(*styling)->layers.extent) * layers_count); - skc_extent_phwN_pdrN_alloc(runtime,&impl->groups,sizeof(*(*styling)->groups.extent) * groups_count); - skc_extent_phwN_pdrN_alloc(runtime,&impl->extras,sizeof(*(*styling)->extras.extent) * extras_count); - - // initialize styling - (*styling)->layers.size = layers_count; - (*styling)->groups.size = groups_count; - (*styling)->extras.size = extras_count; - - (*styling)->layers.count = 0; - (*styling)->groups.count = 0; - (*styling)->extras.count = 0; - - // save pfns - (*styling)->seal = skc_styling_pfn_seal; - (*styling)->unseal = skc_styling_pfn_unseal; - (*styling)->release = skc_styling_pfn_release; - - // set ref count - (*styling)->ref_count = 1; - - // map the extents by unsealing - skc_styling_pfn_unseal(impl,false); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/styling_cl_12.h b/src/compute/skc/styling_cl_12.h deleted file mode 100644 index a319568ee5..0000000000 --- a/src/compute/skc/styling_cl_12.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// -// - -#include - -#include "styling.h" -#include "grid.h" -#include "extent_cl_12.h" -#include "assert_state.h" - -// -// styling states -// - -typedef enum skc_styling_state_e { - - SKC_STYLING_STATE_UNSEALING, - SKC_STYLING_STATE_UNSEALED, - SKC_STYLING_STATE_SEALING, - SKC_STYLING_STATE_SEALED - -} skc_styling_state_e; - -// -// IMPL -// - -struct skc_styling_impl -{ - struct skc_styling * styling; - struct skc_runtime * runtime; - - SKC_ASSERT_STATE_DECLARE(skc_styling_state_e); - - skc_int lock_count; // # of wip renders - - skc_grid_t grid; - - // in-order command queue - cl_command_queue cq; - - // - // only 3 extents - // - struct skc_extent_phwN_pdrN layers; - struct skc_extent_phwN_pdrN groups; - struct skc_extent_phwN_pdrN extras; -}; - -// -// ONLY VISIBLE WITHIN THIS RUNTIME -// - -void -skc_styling_retain_and_lock(struct skc_styling * const styling); - -void -skc_styling_unlock_and_release(struct skc_styling * const styling); - -// -// -// diff --git a/src/compute/skc/surface_cl_12.h b/src/compute/skc/surface_cl_12.h deleted file mode 100644 index 43ea5428a5..0000000000 --- a/src/compute/skc/surface_cl_12.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#ifndef SKC_SURFACE_CL_12_ONCE -#define SKC_SURFACE_CL_12_ONCE - -// -// Unlike other object platform implementations, the surface object -// implementation needs to access the opaque platform-specfic outputs -// of the composition and styling objects. -// -// Composition : { keys, offsets, key_count, offset_count } -// Styling : { layers, groups, commands } -// -// With the OpenCL platform we'll handle this by simply exposing the -// argument value (void*) and its size (size_t). 
-// -// TODO: It might make sense in the future to support more complex -// rendering jobs that simultaneously involve multiple surfaces, -// compositions and stylings. -// - -#endif - -// -// -// diff --git a/src/compute/skc/surface_cl_12_buffer.c b/src/compute/skc/surface_cl_12_buffer.c deleted file mode 100644 index cc7cba5225..0000000000 --- a/src/compute/skc/surface_cl_12_buffer.c +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -// -// -// - -#include "common/cl/assert_cl.h" - -#include "extent_cl_12.h" -#include "runtime_cl_12.h" -#include "styling_cl_12.h" -#include "composition_cl_12.h" - -#include "context.h" -#include "surface.h" - -// -// -// - -#include - -// -// BUILD -// - -struct skc_surface_impl -{ - struct skc_surface * surface; - struct skc_runtime * runtime; - - // framebuffer - // struct skc_extent_pdrw fb; - // struct skc_extent_phrN_pdwN fb; - - // for now, a single in-order command queue - cl_command_queue cq; - - struct { - cl_kernel render; - } kernels; -}; - -// -// we might want concurrent access to the same surface as long as -// the clips don't overlap. -// -// this would require acquiring a cq on demand when it is determined -// that the clipped render won't overlap -// -// { tile clip , cq } pair -// -// skc_uint4 clip; -// cl_command_queue cq -// - -struct skc_surface_render -{ - skc_uint clip[4]; - - struct skc_surface_impl * impl; - struct skc_styling * styling; - struct skc_composition * composition; - - skc_surface_render_pfn_notify notify; - void * data; - - cl_mem fb; - - skc_grid_t grid; - - skc_subbuf_id_t id; -}; - -// -// -// - -static -void -skc_surface_pfn_clear(struct skc_surface_impl * const impl, - float const rgba[4], - skc_uint const rect[4], - void * fb) -{ - size_t const origin[3] = { rect[0], rect[1], 0 }; - size_t const region[3] = { rect[2], rect[3], 1 }; - - cl(EnqueueFillImage(impl->cq, - (cl_mem)fb, - rgba, - origin, - region, - 0,NULL,NULL)); -} - -// -// -// - -static -void -skc_surface_pfn_blit(struct skc_surface_impl * const impl, - skc_uint const rect[4], - skc_int const txty[2]) -{ - ; -} - -// -// -// - -#if 0 // #ifndef NDEBUG -#define SKC_SURFACE_DEBUG -#endif - -#ifdef SKC_SURFACE_DEBUG - -#define SKC_SURFACE_WIDTH 4096 -#define SKC_SURFACE_HEIGHT 4096 - -static -void -skc_surface_debug(struct skc_surface_impl * const impl) -{ - // - // MAP - // - cl_uchar4 * const rgba = skc_extent_phrN_pdwN_map(&impl->fb, - impl->cq, - NULL); - cl(Finish(impl->cq)); - - // - // WRITE - // - FILE* file; - - errno_t ferr = fopen_s(&file,"surface.ppm","wb"); - - fprintf(file,"P6\n%u %u\n255\n",SKC_SURFACE_WIDTH,SKC_SURFACE_HEIGHT); - - for (skc_uint ii=0; iifb,rgba,impl->cq,NULL); - - cl(Flush(impl->cq)); -} - -#endif - -// -// -// - -void -skc_surface_render_complete(struct skc_surface_render * const render) -{ -#ifdef SKC_SURFACE_DEBUG - // write fb out - skc_surface_debug(render->impl); -#endif - - // notify - if (render->notify != NULL) { - render->notify(render->impl->surface, - render->styling, - render->composition, - render->data); - } - - // unlock and release the styling and composition - skc_styling_unlock_and_release(render->styling); - skc_composition_unlock_and_release(render->composition); - - // grid is now complete - skc_grid_complete(render->grid); -} - -static -void -skc_surface_render_cb(cl_event event, cl_int status, struct skc_surface_render * const render) -{ - SKC_CL_CB(status); - 
- // as quickly as possible, enqueue next stage in pipeline to context command scheduler - SKC_SCHEDULER_SCHEDULE(render->impl->runtime->scheduler, - skc_surface_render_complete, - render); -} - -// -// -// - -static -void -skc_surface_grid_pfn_execute(skc_grid_t const grid) -{ - struct skc_surface_render * const render = skc_grid_get_data(grid); - struct skc_surface_impl * const impl = render->impl; - struct skc_runtime * const runtime = impl->runtime; - - // get the composition args - struct skc_composition_impl * const ci = render->composition->impl; - struct skc_place_atomics * const atomics = ci->atomics.hr; - - if (atomics->offsets > 0) - { - // acquire the rbo - cl(EnqueueAcquireGLObjects(impl->cq,1,&render->fb,0,NULL,NULL)); - - // get the styling args - struct skc_styling_impl * const si = render->styling->impl; - - cl(SetKernelArg(impl->kernels.render,0,SKC_CL_ARG(si->layers.drN))); - cl(SetKernelArg(impl->kernels.render,1,SKC_CL_ARG(si->groups.drN))); - cl(SetKernelArg(impl->kernels.render,2,SKC_CL_ARG(si->extras.drN))); - - cl(SetKernelArg(impl->kernels.render,3,SKC_CL_ARG(ci->keys.drw))); - cl(SetKernelArg(impl->kernels.render,4,SKC_CL_ARG(atomics->keys))); - cl(SetKernelArg(impl->kernels.render,5,SKC_CL_ARG(ci->offsets.drw))); - cl(SetKernelArg(impl->kernels.render,6,SKC_CL_ARG(atomics->offsets))); - - // block pool - cl(SetKernelArg(impl->kernels.render,7,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); - - // surface - cl(SetKernelArg(impl->kernels.render,8,SKC_CL_ARG(render->fb))); - -#if 1 - // tile clip - cl(SetKernelArg(impl->kernels.render,9,sizeof(skc_uint4),render->clip)); -#else - // surface pitch (height) - skc_uint const surface_pitch = SKC_SURFACE_HEIGHT; - cl(SetKernelArg(impl->kernels.render,9,SKC_CL_ARG(surface_pitch))); - // tile clip - cl(SetKernelArg(impl->kernels.render,10,sizeof(skc_uint4),render->clip)); -#endif - - // launch render kernel - skc_device_enqueue_kernel(runtime->device, - SKC_DEVICE_KERNEL_ID_RENDER, - impl->cq, - impl->kernels.render, - atomics->offsets, - 0,NULL,NULL); - - - cl_event complete; - - // give the rbo back - cl(EnqueueReleaseGLObjects(impl->cq,1,&render->fb,0,NULL,&complete)); - - // notify anyone listening... 
- cl(SetEventCallback(complete,CL_COMPLETE,skc_surface_render_cb,render)); - cl(ReleaseEvent(complete)); - - // flush it - cl(Flush(impl->cq)); - } - else - { - skc_surface_render_complete(render); - } -} - -// -// -// - -static -void -skc_surface_pfn_release(struct skc_surface_impl * const impl) -{ - if (--impl->surface->ref_count != 0) - return; - - // - // otherwise, release all resources - // - - // drain the command queue - cl(Finish(impl->cq)); - - struct skc_runtime * const runtime = impl->runtime; - - // release the kernel - cl(ReleaseKernel(impl->kernels.render)); - - // free surface host - skc_runtime_host_perm_free(runtime,impl->surface); - - // release the cq - skc_runtime_release_cq_in_order(runtime,impl->cq); - - // release fb - // skc_extent_phrN_pdwN_free(runtime,&impl->fb); - - // free surface impl - skc_runtime_host_perm_free(runtime,impl); -} - -// -// -// - -static -void -skc_surface_grid_pfn_dispose(skc_grid_t const grid) -{ - struct skc_surface_render * const render = skc_grid_get_data(grid); - struct skc_surface_impl * const impl = render->impl; - struct skc_runtime * const runtime = impl->runtime; - - // free the render object - skc_runtime_host_temp_free(runtime,render,render->id); - - // release the surface - skc_surface_pfn_release(impl); -} - -// -// -// - -static -void -skc_surface_pfn_render(struct skc_surface_impl * const impl, - uint32_t const clip[4], - skc_styling_t styling, - skc_composition_t composition, - skc_surface_render_pfn_notify notify, - void * data, - void * fb) -{ - // retain surface - skc_surface_retain(impl->surface); - - // - // FIXME -- we used to seal the styling and composition objects if - // they weren't already. Either test that they're sealed or seal - // them here. - // - - // retain and lock the styling and composition - skc_styling_retain_and_lock(styling); - skc_composition_retain_and_lock(composition); - - // - // allocate a render instance - // - skc_subbuf_id_t id; - struct skc_surface_render * const render = skc_runtime_host_temp_alloc(impl->runtime, - SKC_MEM_FLAGS_READ_WRITE, - sizeof(*render),&id,NULL); - render->id = id; - - render->clip[0] = clip[0]; - render->clip[1] = clip[1]; - render->clip[2] = clip[2]; - render->clip[3] = clip[3]; - - render->impl = impl; - render->styling = styling; - render->composition = composition; - - render->notify = notify; - render->data = data; - - render->fb = fb; - - render->grid = SKC_GRID_DEPS_ATTACH(impl->runtime->deps, - NULL, // invalidation not necessary - render, - NULL, // no waiting - skc_surface_grid_pfn_execute, - skc_surface_grid_pfn_dispose); - - // declare happens-after relationships - skc_grid_happens_after_grid(render->grid,styling->impl->grid); - skc_grid_happens_after_grid(render->grid,composition->impl->grids.sort); - - // wait for styling and composition - skc_grid_start(render->grid); -} - -// -// -// - -skc_err -skc_surface_cl_12_create(struct skc_context * const context, - struct skc_surface * * const surface) -{ - struct skc_runtime * const runtime = context->runtime; - - // allocate surface - (*surface) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**surface)); - - // allocate impl - struct skc_surface_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); - - // initialize surface - // SKC_ASSERT_STATE_INIT((*impl),SKC_SURFACE_STATE_READY); - - (*surface)->context = context; - (*surface)->impl = impl; - (*surface)->ref_count = 1; - - (*surface)->release = skc_surface_pfn_release; - 
(*surface)->clear = skc_surface_pfn_clear; - (*surface)->blit = skc_surface_pfn_blit; - (*surface)->render = skc_surface_pfn_render; - - // intialize impl - impl->surface = *surface; - impl->runtime = runtime; - -#if 0 - // FIXME -- 4K x 4K -- temporarily fixed size - size_t const fb_size = sizeof(skc_uchar4) * SKC_SURFACE_WIDTH * SKC_SURFACE_HEIGHT; - - // create framebuffer - skc_extent_phrN_pdwN_alloc(runtime,&impl->fb,fb_size); -#endif - - // acquire a command queue - impl->cq = skc_runtime_acquire_cq_in_order(runtime); - - // acquire kernel - impl->kernels.render = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RENDER); - - return SKC_ERR_SUCCESS; -} - -// -// -// diff --git a/src/compute/skc/types.h b/src/compute/skc/types.h index 6d6d19aba2..655cea0ad4 100644 --- a/src/compute/skc/types.h +++ b/src/compute/skc/types.h @@ -38,12 +38,6 @@ // // -#include - -// -// -// - #define SKC_TYPE_HELPER(t) skc_##t #define SKC_TYPE(t) SKC_TYPE_HELPER(t) @@ -114,16 +108,16 @@ typedef cl_float16 skc_float16; typedef cl_half skc_half; -#if defined( __CL_HALF2__) +#if defined(__CL_HALF2__) typedef cl_half2 skc_half2; #endif -#if defined( __CL_HALF4__) +#if defined(__CL_HALF4__) typedef cl_half4 skc_half4; #endif -#if defined( __CL_HALF8__) +#if defined(__CL_HALF8__) typedef cl_half8 skc_half8; #endif -#if defined( __CL_HALF16__) +#if defined(__CL_HALF16__) typedef cl_half16 skc_half16; #endif @@ -206,16 +200,16 @@ typedef float16 skc_float16; typedef half skc_half; -#if defined( __CL_HALF2__) +#if defined(__CL_HALF2__) typedef half2 skc_half2; #endif -#if defined( __CL_HALF4__) +#if defined(__CL_HALF4__) typedef half4 skc_half4; #endif -#if defined( __CL_HALF8__) +#if defined(__CL_HALF8__) typedef half8 skc_half8; #endif -#if defined( __CL_HALF16__) +#if defined(__CL_HALF16__) typedef half16 skc_half16; #endif @@ -243,12 +237,6 @@ typedef half16 skc_half16; // // -#endif - -// -// -// - #define SKC_UCHAR_MAX 0xFF #define SKC_SHORT_MAX 0x7FFF @@ -265,3 +253,9 @@ typedef half16 skc_half16; // // +#endif + +// +// +// + -- cgit v1.2.3