9 files changed, 100 insertions, 417 deletions
diff --git a/bench/PMFloatBench.cpp b/bench/PMFloatBench.cpp
index 37542e3525..540fdb7815 100644
--- a/bench/PMFloatBench.cpp
+++ b/bench/PMFloatBench.cpp
@@ -20,20 +20,10 @@ static uint32_t lcg_rand(uint32_t* seed) {
 }
 
 // I'm having better luck getting these to constant-propagate away as template parameters.
-template <bool kClamp, bool kWide>
-struct PMFloatGetSetBench : public Benchmark {
-    PMFloatGetSetBench() {}
+struct PMFloatRoundtripBench : public Benchmark {
+    PMFloatRoundtripBench() {}
 
-    const char* onGetName() override {
-        switch (kClamp << 1 | kWide) {
-            case 0: return "SkPMFloat_get_1x";
-            case 1: return "SkPMFloat_get_4x";
-            case 2: return "SkPMFloat_clamp_1x";
-            case 3: return "SkPMFloat_clamp_4x";
-        }
-        SkFAIL("unreachable");
-        return "oh bother";
-    }
+    const char* onGetName() override { return "SkPMFloat_roundtrip"; }
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
 
     void onDraw(const int loops, SkCanvas* canvas) override {
@@ -41,61 +31,23 @@ struct PMFloatGetSetBench : public Benchmark {
         uint32_t junk = 0;
         uint32_t seed = 0;
         for (int i = 0; i < loops; i++) {
-            SkPMColor colors[4];
+            SkPMColor color;
         #ifdef SK_DEBUG
-            for (int i = 0; i < 4; i++) {
-                // Our SkASSERTs will remind us that it's technically required that we premultiply.
-                colors[i] = SkPreMultiplyColor(lcg_rand(&seed));
-            }
+            // Our SkASSERTs will remind us that it's technically required that we premultiply.
+            color = SkPreMultiplyColor(lcg_rand(&seed));
         #else
             // But it's a lot faster not to, and this code won't really mind the non-PM colors.
-            (void)lcg_rand(&seed);
-            colors[0] = seed + 0;
-            colors[1] = seed + 1;
-            colors[2] = seed + 2;
-            colors[3] = seed + 3;
+            color = lcg_rand(&seed);
         #endif
 
-            SkPMFloat fa,fb,fc,fd;
-            if (kWide) {
-                SkPMFloat::From4PMColors(colors, &fa, &fb, &fc, &fd);
-            } else {
-                fa = SkPMFloat::FromPMColor(colors[0]);
-                fb = SkPMFloat::FromPMColor(colors[1]);
-                fc = SkPMFloat::FromPMColor(colors[2]);
-                fd = SkPMFloat::FromPMColor(colors[3]);
-            }
-
-            SkPMColor back[4];
-            switch (kClamp << 1 | kWide) {
-                case 0: {
-                    back[0] = fa.round();
-                    back[1] = fb.round();
-                    back[2] = fc.round();
-                    back[3] = fd.round();
-                } break;
-                case 1: SkPMFloat::RoundTo4PMColors(fa, fb, fc, fd, back); break;
-                case 2: {
-                    back[0] = fa.roundClamp();
-                    back[1] = fb.roundClamp();
-                    back[2] = fc.roundClamp();
-                    back[3] = fd.roundClamp();
-                } break;
-                case 3: SkPMFloat::RoundClampTo4PMColors(fa, fb, fc, fd, back); break;
-            }
-            for (int i = 0; i < 4; i++) {
-                junk ^= back[i];
-            }
+            auto f = SkPMFloat::FromPMColor(color);
+            SkPMColor back = f.round();
+            junk ^= back;
         }
         blackhole ^= junk;
     }
 };
-
-// Extra () help DEF_BENCH not get confused by the comma inside the <>.
-DEF_BENCH(return (new PMFloatGetSetBench< true,  true>);)
-DEF_BENCH(return (new PMFloatGetSetBench<false,  true>);)
-DEF_BENCH(return (new PMFloatGetSetBench< true, false>);)
-DEF_BENCH(return (new PMFloatGetSetBench<false, false>);)
+DEF_BENCH(return new PMFloatRoundtripBench;)
 
 struct PMFloatGradientBench : public Benchmark {
     const char* onGetName() override { return "PMFloat_gradient"; }
@@ -103,8 +55,8 @@ struct PMFloatGradientBench : public Benchmark {
 
     SkPMColor fDevice[100];
     void onDraw(const int loops, SkCanvas*) override {
-        Sk4f c0 = SkPMFloat::FromARGB(255, 255, 0, 0),
-             c1 = SkPMFloat::FromARGB(255, 0, 0, 255),
+        Sk4f c0 = SkPMFloat::FromARGB(1, 1, 0, 0),
+             c1 = SkPMFloat::FromARGB(1, 0, 0, 1),
              dc = c1 - c0,
              fx(0.1f),
              dx(0.002f),
@@ -112,15 +64,15 @@ struct PMFloatGradientBench : public Benchmark {
              dcdx4(dcdx+dcdx+dcdx+dcdx);
 
         for (int n = 0; n < loops; n++) {
-            Sk4f a = c0 + dc*fx + Sk4f(0.5f),  // The +0.5f lets us call trunc() instead of get().
+            Sk4f a = c0 + dc*fx,
                  b = a + dcdx,
                  c = b + dcdx,
                  d = c + dcdx;
             for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
-                fDevice[i+0] = SkPMFloat(a).trunc();
-                fDevice[i+1] = SkPMFloat(b).trunc();
-                fDevice[i+2] = SkPMFloat(c).trunc();
-                fDevice[i+3] = SkPMFloat(d).trunc();
+                fDevice[i+0] = SkPMFloat(a).round();
+                fDevice[i+1] = SkPMFloat(b).round();
+                fDevice[i+2] = SkPMFloat(c).round();
+                fDevice[i+3] = SkPMFloat(d).round();
                 a = a + dcdx4;
                 b = b + dcdx4;
                 c = c + dcdx4;
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index ce7b75e705..f1d302458d 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -21,15 +21,12 @@
 namespace {
 
 // A pre-multiplied color storing each component in the same order as SkPMColor,
-// but as a float in the range [0, 255].
+// but as a float in the range [0, 1].
 class SkPMFloat : public Sk4f {
 public:
     static SkPMFloat FromPMColor(SkPMColor c) { return SkPMFloat(c); }
     static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
 
-    // May be more efficient than one at a time.  No special alignment assumed for SkPMColors.
-    static void From4PMColors(const SkPMColor[4], SkPMFloat*, SkPMFloat*, SkPMFloat*, SkPMFloat*);
-
     // Uninitialized.
     SkPMFloat() {}
     explicit SkPMFloat(SkPMColor);
@@ -47,24 +44,10 @@ public:
     float g() const { return this->kth<SK_G32_SHIFT / 8>(); }
     float b() const { return this->kth<SK_B32_SHIFT / 8>(); }
 
-    // N.B. All methods returning an SkPMColor call SkPMColorAssert on that result before returning.
-
-    // round() and roundClamp() round component values to the nearest integer.
-    SkPMColor round() const;  // Assumes all values in [0, 255].  Some implementations may clamp.
-    SkPMColor roundClamp() const;  // Will clamp all values to [0, 255].
-
-    // Like round(), but truncates instead of rounding.
-    // The domain of this function is (-1.0f, 256.0f).  Values in (-1.0f, 0.0f] trunc to a zero.
-    SkPMColor trunc() const;
-
-    // 4-at-a-time versions of round() and roundClamp(). Like From4PMColors(), no alignment assumed.
-    static void RoundTo4PMColors(
-            const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
-    static void RoundClampTo4PMColors(
-            const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
+    SkPMColor round() const;  // Rounds from [0.0f, 1.0f] to [0, 255], clamping if out of range.
 
     bool isValid() const {
-        return this->a() >= 0 && this->a() <= 255
+        return this->a() >= 0 && this->a() <= 1
             && this->r() >= 0 && this->r() <= this->a()
             && this->g() >= 0 && this->g() <= this->a()
             && this->b() >= 0 && this->b() <= this->a();
@@ -80,10 +63,8 @@ private:
     // Platform implementations of SkPMFloat assume Sk4f uses SSE or NEON.  _none is generic.
     #include "../opts/SkPMFloat_none.h"
 #else
-    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
-        #include "../opts/SkPMFloat_SSSE3.h"
-    #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-        #include "../opts/SkPMFloat_SSE2.h"
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+        #include "../opts/SkPMFloat_sse.h"
     #elif defined(SK_ARM_HAS_NEON)
         #include "../opts/SkPMFloat_neon.h"
     #else
diff --git a/src/effects/SkColorMatrixFilter.cpp b/src/effects/SkColorMatrixFilter.cpp
index 894a7b089d..78d4797e44 100644
--- a/src/effects/SkColorMatrixFilter.cpp
+++ b/src/effects/SkColorMatrixFilter.cpp
@@ -239,25 +239,9 @@ uint32_t SkColorMatrixFilter::getFlags() const {
     return this->INHERITED::getFlags() | fFlags;
 }
 
-/**
- *  Need inv255 = 1 / 255 as a constant, so when we premul a SkPMFloat, we can do this
- *
- *      new_red = old_red * alpha * inv255
- *
- *  instead of (much slower)
- *
- *      new_red = old_red * alpha / 255
- *
- *  However, 1.0f/255 comes to (in hex) 0x3B808081, which is slightly bigger than the "actual"
- *  value of 0x3B808080(repeat 80)... This slightly too-big value can cause us to compute
- *  new_red > alpha, which is a problem (for valid premul). To fix this, we use a
- *  hand-computed value of 0x3B808080, 1 ULP smaller. This keeps our colors valid.
- */
-static const float gInv255 = 0.0039215683f; //  (1.0f / 255) - ULP == SkBits2Float(0x3B808080)
-
 static Sk4f premul(const Sk4f& x) {
-    float scale = SkPMFloat(x).a() * gInv255;
-    Sk4f pm = x * Sk4f(scale, scale, scale, 1);
+    float scale = SkPMFloat(x).a();
+    Sk4f pm = x * SkPMFloat(1, scale, scale, scale);
 
 #ifdef SK_DEBUG
     SkPMFloat pmf(pm);
@@ -268,12 +252,12 @@ static Sk4f premul(const Sk4f& x) {
 }
 
 static Sk4f unpremul(const SkPMFloat& pm) {
-    float scale = 255 / pm.a(); // candidate for fast/approx invert?
-    return pm * Sk4f(scale, scale, scale, 1);
+    float scale = 1 / pm.a(); // candidate for fast/approx invert?
+    return pm * SkPMFloat(1, scale, scale, scale);
 }
 
-static Sk4f clamp_0_255(const Sk4f& value) {
-    return Sk4f::Max(Sk4f::Min(value, Sk4f(255)), Sk4f(0));
+static Sk4f clamp_0_1(const Sk4f& value) {
+    return Sk4f::Max(Sk4f::Min(value, Sk4f(1)), Sk4f(0));
 }
 
 void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor dst[]) const {
@@ -292,14 +276,16 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor
 #endif
 
     if (use_floats) {
+        // c0-c3 are unitless scale factors, so they work unchanged on [0,1] components.
         const Sk4f c0 = Sk4f::Load(fTranspose + 0);
         const Sk4f c1 = Sk4f::Load(fTranspose + 4);
         const Sk4f c2 = Sk4f::Load(fTranspose + 8);
         const Sk4f c3 = Sk4f::Load(fTranspose + 12);
-        const Sk4f c4 = Sk4f::Load(fTranspose + 16);  // translates
+        // c4 (the translate vector) is expressed in [0,255] units.  Rescale it to [0,1] units.
+        const Sk4f c4 = Sk4f::Load(fTranspose + 16)*Sk4f(1.0f/255);
 
         // todo: we could cache this in the constructor...
-        SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_255(c4))).roundClamp();
+        SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_1(c4))).round();
 
         for (int i = 0; i < count; i++) {
             const SkPMColor src_c = src[i];
@@ -323,7 +309,7 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor
             Sk4f dst4 = c0 * r4 + c1 * g4 + c2 * b4 + c3 * a4 + c4;
 
             // clamp, re-premul, and write
-            dst[i] = SkPMFloat(premul(clamp_0_255(dst4))).round();
+            dst[i] = SkPMFloat(premul(clamp_0_1(dst4))).round();
         }
     } else {
         const State& state = fState;
diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h
deleted file mode 100644
index c7e791ff62..0000000000
--- a/src/opts/SkPMFloat_SSE2.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace {  // See SkPMFloat.h
-
-// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
-// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
-
-// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
-// to 8-bit-in-16-bit, back down to 8-bit components.
-// _mm_packus_epi16() gives us clamping for free while narrowing.
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    SkPMColorAssert(c);
-    __m128i fix8    = _mm_set_epi32(0,0,0,c),
-            fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
-            fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
-    fVec = _mm_cvtepi32_ps(fix8_32);
-    SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::round() const {
-    return this->roundClamp();  // Haven't beaten this yet.
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
-            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
-            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline SkPMColor SkPMFloat::trunc() const {
-    // Basically, same as roundClamp(), but no rounding.
-    __m128i fix8_32 = _mm_cvttps_epi32(fVec),
-            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
-            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
-                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
-    // Haven't beaten this yet.
-    *a = FromPMColor(colors[0]);
-    *b = FromPMColor(colors[1]);
-    *c = FromPMColor(colors[2]);
-    *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    // Haven't beaten this yet.
-    RoundClampTo4PMColors(a,b,c,d, colors);
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    // Same as _SSSE3.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
-            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
-            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
-            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
-    __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
-                                     _mm_packus_epi16(c2, c3));
-    _mm_storeu_si128((__m128i*)colors, c3210);
-    SkPMColorAssert(colors[0]);
-    SkPMColorAssert(colors[1]);
-    SkPMColorAssert(colors[2]);
-    SkPMColorAssert(colors[3]);
-}
-
-}  // namespace
diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h
deleted file mode 100644
index 67116ec2dd..0000000000
--- a/src/opts/SkPMFloat_SSSE3.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace {  // See SkPMFloat.h
-
-// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits
-// (fix8_32), then convert those to floats.
-
-// round() does the opposite, working from floats to 8-bit-in-32-bits, then back to packed 8 bit.
-
-// roundClamp() is the same as _SSE2: floats to 8-in-32, to 8-in-16, to packed 8 bit, with
-// _mm_packus_epi16() both clamping and narrowing.
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    SkPMColorAssert(c);
-    const int _ = 255;  // _ means to zero that byte.
-    __m128i fix8    = _mm_set_epi32(0,0,0,c),
-            fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0));
-    fVec = _mm_cvtepi32_ps(fix8_32);
-    SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::trunc() const {
-    const int _ = 255;  // _ means to zero that byte.
-    __m128i fix8_32 = _mm_cvttps_epi32(fVec),
-            fix8    = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline SkPMColor SkPMFloat::round() const {
-    return SkPMFloat(Sk4f(0.5f) + *this).trunc();
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
-            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
-            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
-                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
-    // Haven't beaten this yet.
-    *a = FromPMColor(colors[0]);
-    *b = FromPMColor(colors[1]);
-    *c = FromPMColor(colors[2]);
-    *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    // Haven't beaten this yet.  Still faster than RoundClampTo4PMColors?
-    colors[0] = a.round();
-    colors[1] = b.round();
-    colors[2] = c.round();
-    colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    // Same as _SSE2.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
-            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
-            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
-            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
-    __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
-                                     _mm_packus_epi16(c2, c3));
-    _mm_storeu_si128((__m128i*)colors, c3210);
-    SkPMColorAssert(colors[0]);
-    SkPMColorAssert(colors[1]);
-    SkPMColorAssert(colors[2]);
-    SkPMColorAssert(colors[3]);
-}
-
-}  // namespace
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index cabb29a2a8..57f613751d 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -7,70 +7,24 @@
 
 namespace { // See SkPMFloat.h
 
-// For SkPMFloat(SkPMFColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
-// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
-
-// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
-// to 8-bit-in-16-bit, back down to 8-bit components.
-// roundClamp() uses vqmovn to clamp while narrowing instead of just narrowing with vmovn.
-
 inline SkPMFloat::SkPMFloat(SkPMColor c) {
     SkPMColorAssert(c);
     uint8x8_t   fix8    = (uint8x8_t)vdup_n_u32(c);
     uint16x8_t  fix8_16 = vmovl_u8(fix8);
     uint32x4_t  fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
-    fVec = vcvtq_f32_u32(fix8_32);
+    fVec = vmulq_f32(vcvtq_f32_u32(fix8_32), vdupq_n_f32(1.0f/255));  // Scale to [0,1].
     SkASSERT(this->isValid());
 }
 
-inline SkPMColor SkPMFloat::trunc() const {
-    uint32x4_t  fix8_32  = vcvtq_u32_f32(fVec);  // vcvtq_u32_f32 truncates
-    uint16x4_t  fix8_16  = vmovn_u32(fix8_32);
-    uint8x8_t   fix8     = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
-    SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
-    SkPMColorAssert(c);
-    return c;
-}
-
 inline SkPMColor SkPMFloat::round() const {
-    return SkPMFloat(Sk4f(0.5f) + *this).trunc();
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
-    float32x4_t add_half = vaddq_f32(fVec, vdupq_n_f32(0.5f));
-    uint32x4_t  fix8_32  = vcvtq_u32_f32(add_half);  // vcvtq_u32_f32 truncates, so round manually
-    uint16x4_t  fix8_16  = vqmovn_u32(fix8_32);
-    uint8x8_t   fix8     = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
+    // vcvtq_u32_f32 truncates, so we round manually: scale by 255, add a half, then convert.
+    float32x4_t rounded = vmlaq_f32(vdupq_n_f32(0.5f), fVec, vdupq_n_f32(255.0f));
+    uint32x4_t  fix8_32 = vcvtq_u32_f32(rounded);
+    uint16x4_t  fix8_16 = vqmovn_u32(fix8_32);
+    uint8x8_t   fix8    = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
     SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
     SkPMColorAssert(c);
     return c;
 }
 
-// TODO: we should be able to beat these loops on all three methods.
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
-                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
-    *a = FromPMColor(colors[0]);
-    *b = FromPMColor(colors[1]);
-    *c = FromPMColor(colors[2]);
-    *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    colors[0] = a.round();
-    colors[1] = b.round();
-    colors[2] = c.round();
-    colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    colors[0] = a.roundClamp();
-    colors[1] = b.roundClamp();
-    colors[2] = c.roundClamp();
-    colors[3] = d.roundClamp();
-}
-
 }  // namespace
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index 9bb584ed86..ba773564d6 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -8,61 +8,26 @@
 namespace {  // See SkPMFloat.h
 
 inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    *this = SkPMFloat::FromARGB(SkGetPackedA32(c),
-                                SkGetPackedR32(c),
-                                SkGetPackedG32(c),
-                                SkGetPackedB32(c));
+    float inv255 = 1.0f/255;
+    *this = SkPMFloat::FromARGB(SkGetPackedA32(c) * inv255,
+                                SkGetPackedR32(c) * inv255,
+                                SkGetPackedG32(c) * inv255,
+                                SkGetPackedB32(c) * inv255);
     SkASSERT(this->isValid());
 }
 
-inline SkPMColor SkPMFloat::trunc() const {
-    return SkPackARGB32(this->a(), this->r(), this->g(), this->b());
-}
-
 inline SkPMColor SkPMFloat::round() const {
-    SkPMColor c = SkPackARGB32(this->a()+0.5f, this->r()+0.5f, this->g()+0.5f, this->b()+0.5f);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
     float a = this->a(),
           r = this->r(),
           g = this->g(),
           b = this->b();
-    a = a < 0 ? 0 : (a > 255 ? 255 : a);
-    r = r < 0 ? 0 : (r > 255 ? 255 : r);
-    g = g < 0 ? 0 : (g > 255 ? 255 : g);
-    b = b < 0 ? 0 : (b > 255 ? 255 : b);
-    SkPMColor c = SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f);
+    a = a < 0 ? 0 : (a > 1 ? 1 : a);
+    r = r < 0 ? 0 : (r > 1 ? 1 : r);
+    g = g < 0 ? 0 : (g > 1 ? 1 : g);
+    b = b < 0 ? 0 : (b > 1 ? 1 : b);
+    SkPMColor c = SkPackARGB32(255*a+0.5f, 255*r+0.5f, 255*g+0.5f, 255*b+0.5f);
     SkPMColorAssert(c);
     return c;
 }
 
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
-                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
-    *a = FromPMColor(colors[0]);
-    *b = FromPMColor(colors[1]);
-    *c = FromPMColor(colors[2]);
-    *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    colors[0] = a.round();
-    colors[1] = b.round();
-    colors[2] = c.round();
-    colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    colors[0] = a.roundClamp();
-    colors[1] = b.roundClamp();
-    colors[2] = c.roundClamp();
-    colors[3] = d.roundClamp();
-}
-
 }  // namespace
diff --git a/src/opts/SkPMFloat_sse.h b/src/opts/SkPMFloat_sse.h
new file mode 100644
index 0000000000..802b17ba0c
--- /dev/null
+++ b/src/opts/SkPMFloat_sse.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+namespace {  // See SkPMFloat.h
+
+inline SkPMFloat::SkPMFloat(SkPMColor c) {
+    SkPMColorAssert(c);
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+    const int _ = 255;  // Zero these bytes.
+    __m128i fix8    = _mm_cvtsi32_si128((int)c),
+            fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
+#else
+    __m128i fix8    = _mm_cvtsi32_si128((int)c),
+            fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
+            fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
+#endif
+    fVec = _mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f / 255));
+    SkASSERT(this->isValid());
+}
+
+inline SkPMColor SkPMFloat::round() const {
+    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
+    __m128 scaled = _mm_mul_ps(_mm_set1_ps(255), fVec);
+    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled)),
+            fix8_16 = _mm_packs_epi32 (fix8_32, fix8_32),  // Saturates to int16, no wraparound.
+            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
+    SkPMColor c = _mm_cvtsi128_si32(fix8);
+    SkPMColorAssert(c);
+    return c;
+}
+
+}  // namespace
diff --git a/tests/PMFloatTest.cpp b/tests/PMFloatTest.cpp
index 1c53081571..b7b39411d1 100644
--- a/tests/PMFloatTest.cpp
+++ b/tests/PMFloatTest.cpp
@@ -12,47 +12,27 @@ DEF_TEST(SkPMFloat, r) {
     // Test SkPMColor <-> SkPMFloat
     SkPMColor c = SkPreMultiplyColor(0xFFCC9933);
     SkPMFloat pmf(c);
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, pmf.a()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, pmf.r()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, pmf.g()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, pmf.b()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*pmf.a()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, 255*pmf.r()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*pmf.g()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, 255*pmf.b()));
     REPORTER_ASSERT(r, c == pmf.round());
 
     // Test rounding.
-    pmf = SkPMFloat(254.5f, 203.5f, 153.1f, 50.8f);
+    pmf = SkPMFloat(254.5f/255, 203.5f/255, 153.1f/255, 50.8f/255);
     REPORTER_ASSERT(r, c == pmf.round());
 
-    pmf = SkPMFloat(255.9f, 204.01f, 153.0f, -0.9f);
-    REPORTER_ASSERT(r, SkPreMultiplyColor(0xFFCC9900) == pmf.trunc());
-
-    // Test clamping.
-    SkPMFloat clamped(SkPMFloat(510.0f, 153.0f, 1.0f, -0.2f).roundClamp());
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, clamped.a()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, clamped.r()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(  1.0f, clamped.g()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(  0.0f, clamped.b()));
+    SkPMFloat clamped(SkPMFloat(510.0f/255, 153.0f/255, 1.0f/255, -0.2f/255).round());
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*clamped.a()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*clamped.r()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(  1.0f, 255*clamped.g()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(  0.0f, 255*clamped.b()));
 
     // Test SkPMFloat <-> Sk4f conversion.
     Sk4f fs = clamped;
     SkPMFloat scaled = fs * Sk4f(0.25f);
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, scaled.a()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, scaled.r()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, scaled.g()));
-    REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, scaled.b()));
-
-    // Test 4-at-a-time conversions.
-    SkPMColor colors[4] = { 0xFF000000, 0xFFFF0000, 0xFF00FF00, 0xFF0000FF };
-    SkPMFloat floats[4];
-    SkPMFloat::From4PMColors(colors, floats+0, floats+1, floats+2, floats+3);
-
-    SkPMColor back[4];
-    SkPMFloat::RoundTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
-    for (int i = 0; i < 4; i++) {
-        REPORTER_ASSERT(r, back[i] == colors[i]);
-    }
-
-    SkPMFloat::RoundClampTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
-    for (int i = 0; i < 4; i++) {
-        REPORTER_ASSERT(r, back[i] == colors[i]);
-    }
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, 255*scaled.a()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, 255*scaled.r()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, 255*scaled.g()));
+    REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, 255*scaled.b()));
 }