From eea7c16d59a163bd5b858891bc916c49063ed8ac Mon Sep 17 00:00:00 2001
From: Mike Klein <mtklein@chromium.org>
Date: Thu, 3 Nov 2016 10:20:35 -0400
Subject: Add Matrix colorfilter pipeline stages.

This breaks the color filter down into a couple logical steps:
  - go to unpremul
  - apply the 4x5 matrix
  - clamp to [0,1]
  - go to premul

Because we already have handy premul clamp stages, we swap the order of clamp and premul.  This is lossless.

While adding our stages to the pipeline, we analyze the matrix to see if we can skip any steps:
  - we can skip unpremul if the shader is opaque (alphas are all 1 ~~~> we're already unpremul);
  - we can skip the premul back if the color filter always produces opaque (here, are the inputs opaque and do we keep them that way, but we could also check for an explicit 0 0 0 0 1 alpha row);
  - we can skip the clamp_0 if the matrix can never produce a value less than 0;
  - we can skip the clamp_1 if the matrix can never produce a value greater than 1.

The only thing that should seem missing is per-pixel alpha checks.  We don't do those here, but instead make up for it by operating on 4-8 pixels at a time.
We don't split the 4x5 matrix into a 4x4 and 1x4 translate.  We could, but when we have FMA (new x86, all ARMv8) we might as well work the translate for free into the FMAs.

This makes gm/fadefilter.cpp draw differently in sRGB and F16 modes, bringing them in line with the GPU sRGB and GPU f16 configs.  It's unclear to me what was wrong with the old CPU implementation.

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=4346
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Change-Id: I14082ded8fb8d63354167d9e6b3f8058f840253e
Reviewed-on: https://skia-review.googlesource.com/4346
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
---
 src/opts/SkRasterPipeline_opts.h | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'src/opts/SkRasterPipeline_opts.h')
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 845fe40e41..9f15391918 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -193,22 +193,32 @@ SI SkNh to_565(const SkNf& r, const SkNf& g, const SkNf& b) {
 
 STAGE(just_return, false) { }
 
-/*  We don't seem to have a need for this yet.
 STAGE(clamp_0, true) {
     a = SkNf::Max(a, 0.0f);
     r = SkNf::Max(r, 0.0f);
     g = SkNf::Max(g, 0.0f);
     b = SkNf::Max(b, 0.0f);
 }
-*/
 
-STAGE(clamp_1, true) {
+STAGE(clamp_a, true) {
     a = SkNf::Min(a, 1.0f);
     r = SkNf::Min(r, a);
     g = SkNf::Min(g, a);
     b = SkNf::Min(b, a);
 }
 
+STAGE(unpremul, true) {
+    r *= a.invert();
+    g *= a.invert();
+    b *= a.invert();
+}
+
+STAGE(premul, true) {
+    r *= a;
+    g *= a;
+    b *= a;
+}
+
 STAGE(swap_src_dst, true) {
     SkTSwap(r,dr);
     SkTSwap(g,dg);
@@ -450,6 +460,19 @@ STAGE(luminance_to_alpha, true) {
     r = g = b = 0;
 }
 
+STAGE(matrix_4x5, true) {
+    auto m = (const float*)ctx;
+
+    auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); };
+    auto R = fma(r,m[0], fma(g,m[4], fma(b,m[ 8], fma(a,m[12], m[16])))),
+         G = fma(r,m[1], fma(g,m[5], fma(b,m[ 9], fma(a,m[13], m[17])))),
+         B = fma(r,m[2], fma(g,m[6], fma(b,m[10], fma(a,m[14], m[18])))),
+         A = fma(r,m[3], fma(g,m[7], fma(b,m[11], fma(a,m[15], m[19]))));
+    r = R;
+    g = G;
+    b = B;
+    a = A;
+}
 
 template <typename Fn>
 SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {
-- 
cgit v1.2.3