4 files changed, 48 insertions, 71 deletions
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index b3b87982af..0243940a10 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -139,24 +139,15 @@ static void SK_VECTORCALL srcover(SkRasterPipeline::Stage* st, size_t x,
     st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
-static Sk4f clamp(const Sk4f& x) {
-    return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);
-}
-
 static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x,
                                      Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
     auto ptr = st->ctx<uint32_t*>() + x;
 
-    r = clamp(sk_linear_to_srgb(r));
-    g = clamp(sk_linear_to_srgb(g));
-    b = clamp(sk_linear_to_srgb(b));
-    a = clamp(         255.0f * a );
-
-    ( SkNx_cast<int>(r)
-    | SkNx_cast<int>(g) << 8
-    | SkNx_cast<int>(b) << 16
-    | SkNx_cast<int>(a) << 24 ).store(ptr);
+    ( sk_linear_to_srgb(r)
+    | sk_linear_to_srgb(g) << 8
+    | sk_linear_to_srgb(b) << 16
+    | Sk4f_round(255.0f*a) << 24).store(ptr);
 }
 
 static void SK_VECTORCALL store_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
@@ -164,9 +155,8 @@ static void SK_VECTORCALL store_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
                                           Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
     auto ptr = st->ctx<uint32_t*>() + x;
 
-    auto rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});
-    rgba = {rgba[0], rgba[1], rgba[2], 255.0f*a[0]};
-    rgba = clamp(rgba);
+    Sk4i rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});
+    rgba = {rgba[0], rgba[1], rgba[2], (int)(255.0f * a[0] + 0.5f)};
 
     SkNx_cast<uint8_t>(rgba).store(ptr);
 }
diff --git a/src/core/SkSRGB.h b/src/core/SkSRGB.h
index d567a962d8..d3baa74631 100644
--- a/src/core/SkSRGB.h
+++ b/src/core/SkSRGB.h
@@ -14,37 +14,33 @@
  *
  *  Current best practices:
  *      - for sRGB -> linear, lookup R,G,B in sk_linear_from_srgb;
- *      - for linear -> sRGB, call sk_linear_to_srgb() for R,G,B, and round;
+ *      - for linear -> sRGB, call sk_linear_to_srgb() for R,G,B, which yields ints (Sk4i) directly;
  *      - the alpha channel is linear in both formats, needing at most *(1/255.0f) or *255.0f.
  *
- *  sk_linear_to_srgb()'s output requires rounding; it does not round for you.
- *
- *  Given inputs in [0,1], sk_linear_to_srgb() will not underflow 0 but may overflow 255.
- *  The overflow is small enough to be handled by rounding.
- *  (But if you don't trust the inputs are in [0,1], you'd better clamp both sides immediately.)
- *
  *  sk_linear_to_srgb() will run a little faster than usual when compiled with SSE4.1+.
  */
 
 extern const float sk_linear_from_srgb[256];
 
-static inline Sk4f sk_linear_to_srgb(const Sk4f& x) {
+static inline Sk4i sk_linear_to_srgb(const Sk4f& x) {
     // Approximation of the sRGB gamma curve (within 1 when scaled to 8-bit pixels).
-    // For 0.00000f <= x <  0.00349f,    12.92 * x
-    // For 0.00349f <= x <= 1.00000f,    0.679*(x.^0.5) + 0.423*x.^(0.25) - 0.101
-    // Note that 0.00349 was selected because it is a point where both functions produce the
-    // same pixel value when rounded.
+    //
+    // The constants below were tuned by brute force to minimize, first, the number of bytes
+    // that fail to round trip (0 of 256); then the number of points halfway between bytes
+    // (in linear space) that fail to hit the right byte (131 of 255); and also the number
+    // of monotonicity regressions over the range [0,1] (0).
+
     auto rsqrt = x.rsqrt(),
          sqrt  = rsqrt.invert(),
          ftrt  = rsqrt.rsqrt();
 
-    auto lo = (12.92f * 255.0f) * x;
+    auto lo = (13.0471f * 255.0f) * x;
 
-    auto hi = (-0.101115084998961f * 255.0f) +
-              (+0.678513029959381f * 255.0f) * sqrt +
-              (+0.422602055039580f * 255.0f) * ftrt;
+    auto hi = (-0.0974983f * 255.0f)
+            + (+0.687999f  * 255.0f) * sqrt
+            + (+0.412999f  * 255.0f) * ftrt;
 
-    return (x < 0.00349f).thenElse(lo, hi);
+    return SkNx_cast<int>( (x < 0.0048f).thenElse(lo, hi) );
 }
 
 #endif//SkSRGB_DEFINED
diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h
index 3bb11f5599..af683e105f 100644
--- a/src/opts/SkColorXform_opts.h
+++ b/src/opts/SkColorXform_opts.h
@@ -16,20 +16,20 @@
 
 namespace SK_OPTS_NS {
 
-static Sk4f linear_to_2dot2(const Sk4f& x) {
+static Sk4f clamp_0_1(const Sk4f& x) {
+    // The order of the arguments is important here.  We want to make sure that NaN
+    // clamps to zero.  Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
+    return Sk4f::Min(Sk4f::Max(x, 0.0f), 1.0f);
+}
+
+static Sk4i linear_to_2dot2(const Sk4f& x) {
     // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
     auto x2  = x.rsqrt(),                            // x^(-1/2)
          x32 = x2.rsqrt().rsqrt().rsqrt().rsqrt(),   // x^(-1/32)
          x64 = x32.rsqrt();                          // x^(+1/64)
 
     // 29 = 32 - 2 - 1
-    return 255.0f * x2.invert() * x32 * x64.invert();
-}
-
-static Sk4f clamp_0_to_255(const Sk4f& x) {
-    // The order of the arguments is important here.  We want to make sure that NaN
-    // clamps to zero.  Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
-    return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);
+    return Sk4f_round(255.0f * x2.invert() * x32 * x64.invert());
 }
 
 enum DstGamma {
@@ -79,21 +79,18 @@ static void color_xform_RGB1(void* dst, const uint32_t* src, int len,
 
         auto store_4 = [&dstReds, &dstGreens, &dstBlues, &dst, &dstTables] {
             if (kSRGB_DstGamma == kDstGamma || k2Dot2_DstGamma == kDstGamma) {
-                Sk4f (*linear_to_curve)(const Sk4f&) =
+                Sk4i (*linear_to_curve)(const Sk4f&) =
                         (kSRGB_DstGamma == kDstGamma) ? sk_linear_to_srgb : linear_to_2dot2;
 
-                dstReds   = linear_to_curve(dstReds);
-                dstGreens = linear_to_curve(dstGreens);
-                dstBlues  = linear_to_curve(dstBlues);
+                auto reds   = linear_to_curve(clamp_0_1(dstReds));
+                auto greens = linear_to_curve(clamp_0_1(dstGreens));
+                auto blues  = linear_to_curve(clamp_0_1(dstBlues));
 
-                dstReds   = clamp_0_to_255(dstReds);
-                dstGreens = clamp_0_to_255(dstGreens);
-                dstBlues  = clamp_0_to_255(dstBlues);
 
-                auto rgba = (Sk4f_round(dstReds)   << SK_R32_SHIFT)
-                          | (Sk4f_round(dstGreens) << SK_G32_SHIFT)
-                          | (Sk4f_round(dstBlues)  << SK_B32_SHIFT)
-                          | (Sk4i{      0xFF       << SK_A32_SHIFT});
+                auto rgba = (reds       << SK_R32_SHIFT)
+                          | (greens     << SK_G32_SHIFT)
+                          | (blues      << SK_B32_SHIFT)
+                          | (Sk4i{0xFF} << SK_A32_SHIFT);
                 rgba.store((uint32_t*) dst);
 
                 dst = SkTAddOffset<void>(dst, 4 * sizeof(uint32_t));
@@ -155,15 +152,13 @@ static void color_xform_RGB1(void* dst, const uint32_t* src, int len,
         auto dstPixel = rXgXbX*r + rYgYbY*g + rZgZbZ*b;
 
         if (kSRGB_DstGamma == kDstGamma || k2Dot2_DstGamma == kDstGamma) {
-            Sk4f (*linear_to_curve)(const Sk4f&) =
+            Sk4i (*linear_to_curve)(const Sk4f&) =
                     (kSRGB_DstGamma == kDstGamma) ? sk_linear_to_srgb : linear_to_2dot2;
 
-            dstPixel = linear_to_curve(dstPixel);
-
-            dstPixel = clamp_0_to_255(dstPixel);
+            auto pixel = linear_to_curve(clamp_0_1(dstPixel));
 
             uint32_t rgba;
-            SkNx_cast<uint8_t>(Sk4f_round(dstPixel)).store(&rgba);
+            SkNx_cast<uint8_t>(pixel).store(&rgba);
             rgba |= 0xFF000000;
             *((uint32_t*) dst) = SkSwizzle_RGBA_to_PMColor(rgba);
             dst = SkTAddOffset<void>(dst, sizeof(uint32_t));
diff --git a/tests/SRGBTest.cpp b/tests/SRGBTest.cpp
index 65bfc59b42..43ec02700f 100644
--- a/tests/SRGBTest.cpp
+++ b/tests/SRGBTest.cpp
@@ -11,28 +11,24 @@
 #include <math.h>
 
 static uint8_t linear_to_srgb(float l) {
-    // Round float to int, truncate that to uint8_t.
-    return (uint8_t)Sk4f_round( sk_linear_to_srgb(Sk4f{l}) )[0];
+    return (uint8_t)sk_linear_to_srgb(Sk4f{l})[0];
 }
 
 DEF_TEST(sk_linear_to_srgb, r) {
-    // Should map 0 -> 0 and 1 -> 1.
-    REPORTER_ASSERT(r,   0 == linear_to_srgb(0.0f));
-    REPORTER_ASSERT(r, 255 == linear_to_srgb(1.0f));
+    // All bytes should round trip.
+    for (int i = 0; i < 256; i++) {
+        int actual = linear_to_srgb(sk_linear_from_srgb[i]);
+        if (i != actual) {
+            ERRORF(r, "%d -> %d\n", i, actual);
+        }
+    }
 
     // Should be monotonic between 0 and 1.
-    // We don't bother checking denorm values.
-    int tolerated_regressions = 0;
-#if defined(SK_ARM_HAS_NEON)
-    // Values around 0.166016 are usually 72 but drop briefly (41 floats) down to 71.
-    tolerated_regressions = 1;
-#endif
     uint8_t prev = 0;
-    for (float f = FLT_MIN; f <= 1.0f; ) {
+    for (float f = FLT_MIN; f <= 1.0f; ) {  // We don't bother checking denorm values.
         uint8_t srgb = linear_to_srgb(f);
 
-        REPORTER_ASSERT(r, srgb >= prev || tolerated_regressions > 0);
-        if (srgb < prev) { tolerated_regressions--; }
+        REPORTER_ASSERT(r, srgb >= prev);
         prev = srgb;
 
         union { float flt; uint32_t bits; } pun = { f };