7 files changed, 531 insertions, 327 deletions
diff --git a/src/gpu/ccpr/GrCCPRCoverageOp.cpp b/src/gpu/ccpr/GrCCPRCoverageOp.cpp
index c63b494268..a923726713 100644
--- a/src/gpu/ccpr/GrCCPRCoverageOp.cpp
+++ b/src/gpu/ccpr/GrCCPRCoverageOp.cpp
@@ -314,13 +314,13 @@ bool GrCCPRCoverageOpsBuilder::finalize(GrOnFlushResourceProvider* onFlushRP,
                 currFan.push_back(ptsIdx += 2);
                 continue;
 
-            case GrCCPRGeometry::Verb::kConvexSerpentineTo:
+            case GrCCPRGeometry::Verb::kMonotonicSerpentineTo:
                 SkASSERT(!currFan.empty());
                 curveInstanceData[currIndices->fSerpentines++] = {ptsIdx, packedAtlasOffset};
                 currFan.push_back(ptsIdx += 3);
                 continue;
 
-            case GrCCPRGeometry::Verb::kConvexLoopTo:
+            case GrCCPRGeometry::Verb::kMonotonicLoopTo:
                 SkASSERT(!currFan.empty());
                 curveInstanceData[currIndices->fLoops++] = {ptsIdx, packedAtlasOffset};
                 currFan.push_back(ptsIdx += 3);
@@ -410,13 +410,13 @@ void GrCCPRCoverageOp::onExecute(GrOpFlushState* flushState) {
 
     // Cubics.
     auto constexpr kCubicsGrPrimitiveType = GrCCPRCoverageProcessor::kCubicsGrPrimitiveType;
-    this->drawMaskPrimitives(flushState, pipeline, Mode::kSerpentineInsets,
+    this->drawMaskPrimitives(flushState, pipeline, Mode::kSerpentineHulls,
                              kCubicsGrPrimitiveType, 4, &PrimitiveTallies::fSerpentines);
-    this->drawMaskPrimitives(flushState, pipeline, Mode::kLoopInsets,
+    this->drawMaskPrimitives(flushState, pipeline, Mode::kLoopHulls,
                              kCubicsGrPrimitiveType, 4, &PrimitiveTallies::fLoops);
-    this->drawMaskPrimitives(flushState, pipeline, Mode::kSerpentineBorders,
+    this->drawMaskPrimitives(flushState, pipeline, Mode::kSerpentineCorners,
                              kCubicsGrPrimitiveType, 4, &PrimitiveTallies::fSerpentines);
-    this->drawMaskPrimitives(flushState, pipeline, Mode::kLoopBorders,
+    this->drawMaskPrimitives(flushState, pipeline, Mode::kLoopCorners,
                              kCubicsGrPrimitiveType, 4, &PrimitiveTallies::fLoops);
 }
 
diff --git a/src/gpu/ccpr/GrCCPRCoverageProcessor.cpp b/src/gpu/ccpr/GrCCPRCoverageProcessor.cpp
index 69ec6ef0d1..69605095f6 100644
--- a/src/gpu/ccpr/GrCCPRCoverageProcessor.cpp
+++ b/src/gpu/ccpr/GrCCPRCoverageProcessor.cpp
@@ -30,14 +30,14 @@ const char* GrCCPRCoverageProcessor::GetProcessorName(Mode mode) {
             return "GrCCPRQuadraticHullProcessor";
         case Mode::kQuadraticCorners:
             return "GrCCPRQuadraticCornerProcessor";
-        case Mode::kSerpentineInsets:
-            return "GrCCPRCubicInsetProcessor (serpentine)";
-        case Mode::kSerpentineBorders:
-            return "GrCCPRCubicBorderProcessor (serpentine)";
-        case Mode::kLoopInsets:
-            return "GrCCPRCubicInsetProcessor (loop)";
-        case Mode::kLoopBorders:
-            return "GrCCPRCubicBorderProcessor (loop)";
+        case Mode::kSerpentineHulls:
+            return "GrCCPRCubicHullProcessor (serpentine)";
+        case Mode::kLoopHulls:
+            return "GrCCPRCubicHullProcessor (loop)";
+        case Mode::kSerpentineCorners:
+            return "GrCCPRCubicCornerProcessor (serpentine)";
+        case Mode::kLoopCorners:
+            return "GrCCPRCubicCornerProcessor (loop)";
     }
     SK_ABORT("Unexpected ccpr coverage processor mode.");
     return nullptr;
@@ -76,14 +76,14 @@ GrGLSLPrimitiveProcessor* GrCCPRCoverageProcessor::createGLSLInstance(const GrSh
             return new GrCCPRQuadraticHullProcessor();
         case Mode::kQuadraticCorners:
             return new GrCCPRQuadraticCornerProcessor();
-        case Mode::kSerpentineInsets:
-            return new GrCCPRCubicInsetProcessor(GrCCPRCubicProcessor::Type::kSerpentine);
-        case Mode::kSerpentineBorders:
-            return new GrCCPRCubicBorderProcessor(GrCCPRCubicProcessor::Type::kSerpentine);
-        case Mode::kLoopInsets:
-            return new GrCCPRCubicInsetProcessor(GrCCPRCubicProcessor::Type::kLoop);
-        case Mode::kLoopBorders:
-            return new GrCCPRCubicBorderProcessor(GrCCPRCubicProcessor::Type::kLoop);
+        case Mode::kSerpentineHulls:
+            return new GrCCPRCubicHullProcessor(GrCCPRCubicProcessor::CubicType::kSerpentine);
+        case Mode::kLoopHulls:
+            return new GrCCPRCubicHullProcessor(GrCCPRCubicProcessor::CubicType::kLoop);
+        case Mode::kSerpentineCorners:
+            return new GrCCPRCubicCornerProcessor(GrCCPRCubicProcessor::CubicType::kSerpentine);
+        case Mode::kLoopCorners:
+            return new GrCCPRCubicCornerProcessor(GrCCPRCubicProcessor::CubicType::kLoop);
     }
     SK_ABORT("Unexpected ccpr coverage processor mode.");
     return nullptr;
@@ -169,12 +169,13 @@ void PrimitiveProcessor::emitGeometryShader(const GrCCPRCoverageProcessor& proc,
 
 int PrimitiveProcessor::emitHullGeometry(GrGLSLGeometryBuilder* g, const char* emitVertexFn,
                                          const char* polygonPts, int numSides,
-                                         const char* wedgeIdx, const char* insetPts) const {
+                                         const char* wedgeIdx, const char* midpoint) const {
     SkASSERT(numSides >= 3);
 
-    if (!insetPts) {
-        g->codeAppendf("highp float2 centroidpt = %s * float%i(%f);",
+    if (!midpoint) {
+        g->codeAppendf("highp float2 midpoint = %s * float%i(%f);",
                        polygonPts, numSides, 1.0 / numSides);
+        midpoint = "midpoint";
     }
 
     g->codeAppendf("int previdx = (%s + %i) %% %i, "
@@ -222,15 +223,8 @@ int PrimitiveProcessor::emitHullGeometry(GrGLSLGeometryBuilder* g, const char* e
 
     // Emit one third of what is the convex hull of pixel-size boxes centered on the vertices.
     // Each invocation emits a different third.
-    if (insetPts) {
-        g->codeAppendf("%s(%s[rightidx], 1);", emitVertexFn, insetPts);
-    }
     g->codeAppendf("%s(right + bloat * dr, 1);", emitVertexFn);
-    if (insetPts) {
-        g->codeAppendf("%s(%s[%s], 1);", emitVertexFn, insetPts, wedgeIdx);
-    } else {
-        g->codeAppendf("%s(centroidpt, 1);", emitVertexFn);
-    }
+    g->codeAppendf("%s(%s, 1);", emitVertexFn, midpoint);
     g->codeAppendf("%s(self + bloat * %s, 1);", emitVertexFn, dr2);
     g->codeAppend ("if (any(dnotequal)) {");
     g->codeAppendf(    "%s(self + bloat * dl, 1);", emitVertexFn);
@@ -240,7 +234,7 @@ int PrimitiveProcessor::emitHullGeometry(GrGLSLGeometryBuilder* g, const char* e
     g->codeAppend ("}");
     g->codeAppend ("EndPrimitive();");
 
-    return insetPts ? 6 : 5;
+    return 5;
 }
 
 int PrimitiveProcessor::emitEdgeGeometry(GrGLSLGeometryBuilder* g, const char* emitVertexFn,
diff --git a/src/gpu/ccpr/GrCCPRCoverageProcessor.h b/src/gpu/ccpr/GrCCPRCoverageProcessor.h
index d0b20cf686..2835cc5a5f 100644
--- a/src/gpu/ccpr/GrCCPRCoverageProcessor.h
+++ b/src/gpu/ccpr/GrCCPRCoverageProcessor.h
@@ -68,10 +68,10 @@ public:
         kQuadraticCorners,
 
         // Cubics.
-        kSerpentineInsets,
-        kSerpentineBorders,
-        kLoopInsets,
-        kLoopBorders
+        kSerpentineHulls,
+        kLoopHulls,
+        kSerpentineCorners,
+        kLoopCorners
     };
     static constexpr GrVertexAttribType InstanceArrayFormat(Mode mode) {
         return mode < Mode::kQuadraticHulls ? kVec4i_GrVertexAttribType : kVec2i_GrVertexAttribType;
@@ -92,9 +92,8 @@ public:
     void getGLSLProcessorKey(const GrShaderCaps&, GrProcessorKeyBuilder*) const override;
     GrGLSLPrimitiveProcessor* createGLSLInstance(const GrShaderCaps&) const override;
 
-#ifdef SK_DEBUG
     static constexpr float kDebugBloat = 50;
-
+#ifdef SK_DEBUG
     // Increases the 1/2 pixel AA bloat by a factor of kDebugBloat and outputs color instead of
     // coverage (coverage=+1 -> green, coverage=0 -> black, coverage=-1 -> red).
     void enableDebugVisualizations() { fDebugVisualizations = true; }
@@ -188,14 +187,11 @@ protected:
     // Logically, the conservative raster hull is equivalent to the convex hull of pixel-size boxes
     // centered on the vertices.
     //
-    // If an optional inset polygon is provided, then this emits a border from the inset to the
-    // hull, rather than the entire hull.
-    //
     // Geometry shader must be configured to output triangle strips.
     //
     // Returns the maximum number of vertices that will be emitted.
     int emitHullGeometry(GrGLSLGeometryBuilder*, const char* emitVertexFn, const char* polygonPts,
-                         int numSides, const char* wedgeIdx, const char* insetPts = nullptr) const;
+                         int numSides, const char* wedgeIdx, const char* midpoint = nullptr) const;
 
     // Emits the conservative raster of an edge (i.e. convex hull of two pixel-size boxes centered
     // on the endpoints). Coverage is -1 on the outside border of the edge geometry and 0 on the
diff --git a/src/gpu/ccpr/GrCCPRCubicProcessor.cpp b/src/gpu/ccpr/GrCCPRCubicProcessor.cpp
index ad0729bca1..0ac4517d5f 100644
--- a/src/gpu/ccpr/GrCCPRCubicProcessor.cpp
+++ b/src/gpu/ccpr/GrCCPRCubicProcessor.cpp
@@ -16,56 +16,10 @@ void GrCCPRCubicProcessor::onEmitVertexShader(const GrCCPRCoverageProcessor& pro
                                               const TexelBufferHandle& pointsBuffer,
                                               const char* atlasOffset, const char* rtAdjust,
                                               GrGPArgs* gpArgs) const {
-    float inset = 1 - kAABloatRadius;
-#ifdef SK_DEBUG
-    if (proc.debugVisualizations()) {
-        inset *= GrCCPRCoverageProcessor::kDebugBloat;
-    }
-#endif
-
-    // Fetch all 4 cubic bezier points.
-    v->codeAppendf("int4 indices = int4(%s.x, %s.x + 1, %s.x + 2, %s.x + 3);",
-                   proc.instanceAttrib(), proc.instanceAttrib(), proc.instanceAttrib(),
-                   proc.instanceAttrib());
-    v->codeAppend ("highp float4x2 bezierpts = float4x2(");
-    v->appendTexelFetch(pointsBuffer, "indices[sk_VertexID]");
-    v->codeAppend (".xy, ");
-    v->appendTexelFetch(pointsBuffer, "indices[(sk_VertexID + 1) % 4]");
-    v->codeAppend (".xy, ");
-    v->appendTexelFetch(pointsBuffer, "indices[(sk_VertexID + 2) % 4]");
-    v->codeAppend (".xy, ");
-    v->appendTexelFetch(pointsBuffer, "indices[(sk_VertexID + 3) % 4]");
-    v->codeAppend (".xy);");
-
-    // Find the corner of the inset geometry that corresponds to this bezier vertex (bezierpts[0]).
-    v->codeAppend ("highp float2x2 N = float2x2(bezierpts[3].y - bezierpts[0].y, "
-                                               "bezierpts[0].x - bezierpts[3].x, "
-                                               "bezierpts[1].y - bezierpts[0].y, "
-                                               "bezierpts[0].x - bezierpts[1].x);");
-    v->codeAppend ("highp float2x2 P = float2x2(bezierpts[3], bezierpts[1]);");
-    v->codeAppend ("if (abs(determinant(N)) < 2) {"); // Area of [pts[3], pts[0], pts[1]] < 1px.
-                       // The inset corner doesn't exist because we are effectively colinear with
-                       // both neighbor vertices. Just duplicate a neighbor's inset corner.
-    v->codeAppend (    "int smallidx = (dot(N[0], N[0]) > dot(N[1], N[1])) ? 1 : 0;");
-    v->codeAppend (    "N[smallidx] = float2(bezierpts[2].y - bezierpts[3 - smallidx * 2].y, "
-                                            "bezierpts[3 - smallidx * 2].x - bezierpts[2].x);");
-    v->codeAppend (    "P[smallidx] = bezierpts[2];");
-    v->codeAppend ("}");
-    v->codeAppend ("N[0] *= sign(dot(N[0], P[1] - P[0]));");
-    v->codeAppend ("N[1] *= sign(dot(N[1], P[0] - P[1]));");
-
-    v->codeAppendf("highp float2 K = float2(dot(N[0], P[0] + %f * sign(N[0])), "
-                                           "dot(N[1], P[1] + %f * sign(N[1])));", inset, inset);
-    v->codeAppendf("%s.xy = K * inverse(N) + %s;", fInset.vsOut(), atlasOffset);
-    v->codeAppendf("%s.xy = %s.xy * %s.xz + %s.yw;",
-                   fInset.vsOut(), fInset.vsOut(), rtAdjust, rtAdjust);
-
-    // The z component tells the gemetry shader how "sharp" this corner is.
-    v->codeAppendf("%s.z = determinant(N) * sign(%s.x) * sign(%s.z);",
-                   fInset.vsOut(), rtAdjust, rtAdjust);
-
-    // Emit the vertex position.
-    v->codeAppendf("highp float2 self = bezierpts[0] + %s;", atlasOffset);
+    v->codeAppend ("highp float2 self = ");
+    v->appendTexelFetch(pointsBuffer,
+                        SkStringPrintf("%s.x + sk_VertexID", proc.instanceAttrib()).c_str());
+    v->codeAppendf(".xy + %s;", atlasOffset);
     gpArgs->fPositionVar.set(kVec2f_GrSLType, "self");
 }
 
@@ -93,63 +47,13 @@ void GrCCPRCubicProcessor::onEmitGeometryShader(GrGLSLGeometryBuilder* g, const
                                                 const char* wind, const char* rtAdjust) const {
     // Prepend bezierpts at the start of the shader.
     g->codePrependf("highp float4x2 bezierpts = float4x2(sk_in[0].gl_Position.xy, "
-                                                    "sk_in[1].gl_Position.xy, "
-                                                    "sk_in[2].gl_Position.xy, "
-                                                    "sk_in[3].gl_Position.xy);");
+                                                        "sk_in[1].gl_Position.xy, "
+                                                        "sk_in[2].gl_Position.xy, "
+                                                        "sk_in[3].gl_Position.xy);");
 
-    // Evaluate the cubic at t=.5 for an approximate midpoint.
+    // Evaluate the cubic at T=.5 for a mid-ish point.
     g->codeAppendf("highp float2 midpoint = bezierpts * float4(.125, .375, .375, .125);");
 
-    // Finish finding the inset geometry we started in the vertex shader. The z component tells us
-    // how "sharp" an inset corner is. And the vertex shader already skips one corner if it is
-    // colinear with its neighbors. So at this point, if a corner is flat, it means the inset
-    // geometry is all empty (it should never be non-convex because the curve gets chopped into
-    // convex segments ahead of time).
-    g->codeAppendf("bool isempty = "
-                       "any(lessThan(float4(%s[0].z, %s[1].z, %s[2].z, %s[3].z) * %s, float4(2)));",
-                   fInset.gsIn(), fInset.gsIn(), fInset.gsIn(), fInset.gsIn(), wind);
-    g->codeAppendf("highp float2 inset[4];");
-    g->codeAppend ("for (int i = 0; i < 4; ++i) {");
-    g->codeAppendf(    "inset[i] = isempty ? midpoint : %s[i].xy;", fInset.gsIn());
-    g->codeAppend ("}");
-
-    // We determine crossover and/or degeneracy by how many inset edges run the opposite direction
-    // of their corresponding bezier edge. If there is one backwards edge, the inset geometry is
-    // actually triangle with a vertex at the crossover point. If there are >1 backwards edges, the
-    // inset geometry doesn't exist (i.e. the bezier quadrilateral isn't large enough) and we
-    // degenerate to the midpoint.
-    g->codeAppend ("lowp float backwards[4];");
-    g->codeAppend ("lowp int numbackwards = 0;");
-    g->codeAppend ("for (int i = 0; i < 4; ++i) {");
-    g->codeAppend (    "lowp int j = (i + 1) % 4;");
-    g->codeAppendf(    "highp float2 inner = inset[j] - inset[i];");
-    g->codeAppendf(    "highp float2 outer = sk_in[j].gl_Position.xy - sk_in[i].gl_Position.xy;");
-    g->codeAppendf(    "backwards[i] = sign(dot(outer, inner));");
-    g->codeAppendf(    "numbackwards += backwards[i] < 0 ? 1 : 0;");
-    g->codeAppend ("}");
-
-    // Find the crossover point. If there actually isn't one, this math is meaningless and will get
-    // dropped on the floor later.
-    g->codeAppend ("lowp int x = (backwards[0] != backwards[2]) ? 1 : 0;");
-    g->codeAppend ("lowp int x3 = (x + 3) % 4;");
-    g->codeAppend ("highp float2x2 X = float2x2(inset[x].y - inset[x+1].y, "
-                                               "inset[x+1].x - inset[x].x, "
-                                               "inset[x+2].y - inset[x3].y, "
-                                               "inset[x3].x - inset[x+2].x);");
-    g->codeAppend ("highp float2 KK = float2(dot(X[0], inset[x]), dot(X[1], inset[x+2]));");
-    g->codeAppend ("highp float2 crossoverpoint = KK * inverse(X);");
-
-    // Determine what point backwards edges should collapse into. If there is one backwards edge,
-    // it should collapse to the crossover point. If >1, they should all collapse to the midpoint.
-    g->codeAppend ("highp float2 collapsepoint = numbackwards == 1 ? crossoverpoint : midpoint;");
-
-    // Collapse backwards egdes to the "collapse" point.
-    g->codeAppend ("for (int i = 0; i < 4; ++i) {");
-    g->codeAppend (    "if (backwards[i] < 0) {");
-    g->codeAppend (        "inset[i] = inset[(i + 1) % 4] = collapsepoint;");
-    g->codeAppend (    "}");
-    g->codeAppend ("}");
-
     // Find the cubic's power basis coefficients.
     g->codeAppend ("highp float2x4 C = float4x4(-1,  3, -3,  1, "
                                                " 3, -6,  3,  0, "
@@ -166,7 +70,7 @@ void GrCCPRCubicProcessor::onEmitGeometryShader(GrGLSLGeometryBuilder* g, const
     g->codeAppend ("highp float4 K, L, M;");
     g->codeAppend ("highp float2 l, m;");
     g->codeAppend ("highp float discr = 3*D2*D2 - 4*D1*D3;");
-    if (Type::kSerpentine == fType) {
+    if (CubicType::kSerpentine == fCubicType) {
         // This math also works out for the "cusp" and "cusp at infinity" cases.
         g->codeAppend ("highp float q = 3*D2 + sign(D2) * sqrt(max(3*discr, 0));");
         g->codeAppend ("l.ts = normalize(float2(q, 6*D1));");
@@ -206,119 +110,105 @@ void GrCCPRCubicProcessor::onEmitGeometryShader(GrGLSLGeometryBuilder* g, const
     g->codeAppendf("%s[2] = %s[2].xy * %s.xz;",
                    fKLMDerivatives.c_str(), fKLMMatrix.c_str(), rtAdjust);
 
+    // Determine the amount of additional coverage to subtract out for the flat edge (P3 -> P0).
+    g->declareGlobal(fEdgeDistanceEquation);
+    g->codeAppendf("int edgeidx0 = %s > 0 ? 3 : 0;", wind);
+    g->codeAppendf("highp float2 edgept0 = bezierpts[edgeidx0];");
+    g->codeAppendf("highp float2 edgept1 = bezierpts[3 - edgeidx0];");
+    this->emitEdgeDistanceEquation(g, "edgept0", "edgept1", fEdgeDistanceEquation.c_str());
+
     this->emitCubicGeometry(g, emitVertexFn, wind, rtAdjust);
 }
 
-void GrCCPRCubicInsetProcessor::emitCubicGeometry(GrGLSLGeometryBuilder* g,
-                                                  const char* emitVertexFn, const char* wind,
-                                                  const char* rtAdjust) const {
+void GrCCPRCubicProcessor::emitPerVertexGeometryCode(SkString* fnBody, const char* position,
+                                                     const char* /*coverage*/,
+                                                     const char* /*wind*/) const {
+    fnBody->appendf("highp float3 klm = float3(%s, 1) * %s;", position, fKLMMatrix.c_str());
+    fnBody->appendf("highp float d = dot(float3(%s, 1), %s);",
+                    position, fEdgeDistanceEquation.c_str());
+    fnBody->appendf("%s = float4(klm, d);", fKLMD.gsOut());
+    this->onEmitPerVertexGeometryCode(fnBody);
+}
+
+void GrCCPRCubicHullProcessor::emitCubicGeometry(GrGLSLGeometryBuilder* g, const char* emitVertexFn,
+                                                 const char* wind, const char* rtAdjust) const {
     // FIXME: we should clip this geometry at the tip of the curve.
-    g->codeAppendf("%s(inset[0], 1);", emitVertexFn);
-    g->codeAppendf("%s(inset[1], 1);", emitVertexFn);
-    g->codeAppendf("%s(inset[3], 1);", emitVertexFn);
-    g->codeAppendf("%s(inset[2], 1);", emitVertexFn);
-    g->codeAppend ("EndPrimitive();");
+    int maxVertices = this->emitHullGeometry(g, emitVertexFn, "bezierpts", 4, "sk_InvocationID",
+                                             "midpoint");
 
     g->configure(GrGLSLGeometryBuilder::InputType::kLinesAdjacency,
                  GrGLSLGeometryBuilder::OutputType::kTriangleStrip,
-                 4, 1);
+                 maxVertices, 4);
 }
 
-void GrCCPRCubicInsetProcessor::emitPerVertexGeometryCode(SkString* fnBody, const char* position,
-                                                          const char* /*coverage*/,
-                                                          const char* /*wind*/) const {
-    fnBody->appendf("highp float3 klm = float3(%s, 1) * %s;", position, fKLMMatrix.c_str());
-    fnBody->appendf("%s = klm;", fKLM.gsOut());
+void GrCCPRCubicHullProcessor::onEmitPerVertexGeometryCode(SkString* fnBody) const {
+    // "klm" was just defined by the base class.
     fnBody->appendf("%s[0] = 3 * klm[0] * %s[0];", fGradMatrix.gsOut(), fKLMDerivatives.c_str());
     fnBody->appendf("%s[1] = -klm[1] * %s[2].xy - klm[2] * %s[1].xy;",
                     fGradMatrix.gsOut(), fKLMDerivatives.c_str(), fKLMDerivatives.c_str());
 }
 
-void GrCCPRCubicInsetProcessor::emitShaderCoverage(GrGLSLFragmentBuilder* f,
-                                                   const char* outputCoverage) const {
-    f->codeAppendf("highp float k = %s.x, l = %s.y, m = %s.z;",
-                   fKLM.fsIn(), fKLM.fsIn(), fKLM.fsIn());
+void GrCCPRCubicHullProcessor::emitShaderCoverage(GrGLSLFragmentBuilder* f,
+                                                  const char* outputCoverage) const {
+    f->codeAppendf("highp float k = %s.x, l = %s.y, m = %s.z, d = %s.w;",
+                   fKLMD.fsIn(), fKLMD.fsIn(), fKLMD.fsIn(), fKLMD.fsIn());
     f->codeAppend ("highp float f = k*k*k - l*m;");
-    f->codeAppendf("highp float2 grad = %s * float2(k, 1);", fGradMatrix.fsIn());
-    f->codeAppend ("highp float d = f * inversesqrt(dot(grad, grad));");
-    f->codeAppendf("%s = clamp(0.5 - d, 0, 1);", outputCoverage);
+    f->codeAppendf("highp float2 grad_f = %s * float2(k, 1);", fGradMatrix.fsIn());
+    f->codeAppendf("%s = clamp(0.5 - f * inversesqrt(dot(grad_f, grad_f)), 0, 1);", outputCoverage);
+    f->codeAppendf("%s += min(d, 0);", outputCoverage); // Flat closing edge.
 }
 
-void GrCCPRCubicBorderProcessor::emitCubicGeometry(GrGLSLGeometryBuilder* g,
+void GrCCPRCubicCornerProcessor::emitCubicGeometry(GrGLSLGeometryBuilder* g,
                                                    const char* emitVertexFn, const char* wind,
                                                    const char* rtAdjust) const {
     // We defined bezierpts in onEmitGeometryShader.
-    g->declareGlobal(fEdgeDistanceEquation);
-    g->codeAppendf("int edgeidx0 = %s > 0 ? 3 : 0;", wind);
-    g->codeAppendf("highp float2 edgept0 = bezierpts[edgeidx0];");
-    g->codeAppendf("highp float2 edgept1 = bezierpts[3 - edgeidx0];");
-    this->emitEdgeDistanceEquation(g, "edgept0", "edgept1", fEdgeDistanceEquation.c_str());
-    g->codeAppendf("%s.z += 0.5;", fEdgeDistanceEquation.c_str()); // outer = -.5, inner = .5
-
     g->declareGlobal(fEdgeDistanceDerivatives);
     g->codeAppendf("%s = %s.xy * %s.xz;",
                    fEdgeDistanceDerivatives.c_str(), fEdgeDistanceEquation.c_str(), rtAdjust);
 
-    g->declareGlobal(fEdgeSpaceTransform);
-    g->codeAppend ("highp float4 edgebbox = float4(min(bezierpts[0], bezierpts[3]) - bloat, "
-                                              "max(bezierpts[0], bezierpts[3]) + bloat);");
-    g->codeAppendf("%s.xy = 2 / float2(edgebbox.zw - edgebbox.xy);", fEdgeSpaceTransform.c_str());
-    g->codeAppendf("%s.zw = -1 - %s.xy * edgebbox.xy;",
-                   fEdgeSpaceTransform.c_str(), fEdgeSpaceTransform.c_str());
-
-    int maxVertices = this->emitHullGeometry(g, emitVertexFn, "bezierpts", 4, "sk_InvocationID",
-                                             "inset");
+    g->codeAppendf("highp float2 corner = bezierpts[sk_InvocationID * 3];");
+    int numVertices = this->emitCornerGeometry(g, emitVertexFn, "corner");
 
     g->configure(GrGLSLGeometryBuilder::InputType::kLinesAdjacency,
-                 GrGLSLGeometryBuilder::OutputType::kTriangleStrip,
-                 maxVertices, 4);
+                 GrGLSLGeometryBuilder::OutputType::kTriangleStrip, numVertices, 2);
 }
 
-void GrCCPRCubicBorderProcessor::emitPerVertexGeometryCode(SkString* fnBody, const char* position,
-                                                           const char* /*coverage*/,
-                                                           const char* /*wind*/) const {
-    fnBody->appendf("highp float3 klm = float3(%s, 1) * %s;", position, fKLMMatrix.c_str());
-    fnBody->appendf("highp float d = dot(float3(%s, 1), %s);",
-                    position, fEdgeDistanceEquation.c_str());
-    fnBody->appendf("%s = float4(klm, d);", fKLMD.gsOut());
+void GrCCPRCubicCornerProcessor::onEmitPerVertexGeometryCode(SkString* fnBody) const {
     fnBody->appendf("%s = float4(%s[0].x, %s[1].x, %s[2].x, %s.x);",
                     fdKLMDdx.gsOut(), fKLMDerivatives.c_str(), fKLMDerivatives.c_str(),
                     fKLMDerivatives.c_str(), fEdgeDistanceDerivatives.c_str());
     fnBody->appendf("%s = float4(%s[0].y, %s[1].y, %s[2].y, %s.y);",
                     fdKLMDdy.gsOut(), fKLMDerivatives.c_str(), fKLMDerivatives.c_str(),
                     fKLMDerivatives.c_str(), fEdgeDistanceDerivatives.c_str());
-    fnBody->appendf("%s = position * %s.xy + %s.zw;", fEdgeSpaceCoord.gsOut(),
-                    fEdgeSpaceTransform.c_str(), fEdgeSpaceTransform.c_str());
 
     // Otherwise, fEdgeDistances = fEdgeDistances * sign(wind * rtAdjust.x * rdAdjust.z).
     GR_STATIC_ASSERT(kTopLeft_GrSurfaceOrigin == GrCCPRCoverageProcessor::kAtlasOrigin);
 }
 
-void GrCCPRCubicBorderProcessor::emitShaderCoverage(GrGLSLFragmentBuilder* f,
+void GrCCPRCubicCornerProcessor::emitShaderCoverage(GrGLSLFragmentBuilder* f,
                                                     const char* outputCoverage) const {
-    // Use software msaa to determine coverage.
-    const int sampleCount = this->defineSoftSampleLocations(f, "samples");
-
-    // Along the shared edge, we start with distance-to-edge coverage, then subtract out the
-    // remaining pixel coverage that is still inside the shared edge, but outside the curve.
-    // Outside the shared edege, we just use standard msaa to count samples inside the curve.
-    f->codeAppendf("bool use_edge = all(lessThan(abs(%s), float2(1)));", fEdgeSpaceCoord.fsIn());
-    f->codeAppendf("%s = (use_edge ? clamp(%s.w + 0.5, 0, 1) : 0) * %i;",
-                   outputCoverage, fKLMD.fsIn(), sampleCount);
+    f->codeAppendf("highp float2x4 grad_klmd = float2x4(%s, %s);",
+                   fdKLMDdx.fsIn(), fdKLMDdy.fsIn());
 
-    f->codeAppendf("highp float2x4 grad_klmd = float2x4(%s, %s);", fdKLMDdx.fsIn(),
-                   fdKLMDdy.fsIn());
+    // Erase what the previous hull shader wrote. We don't worry about the two corners falling on
+    // the same pixel because those cases should have been weeded out by this point.
+    f->codeAppendf("highp float k = %s.x, l = %s.y, m = %s.z, d = %s.w;",
+                   fKLMD.fsIn(), fKLMD.fsIn(), fKLMD.fsIn(), fKLMD.fsIn());
+    f->codeAppend ("highp float f = k*k*k - l*m;");
+    f->codeAppend ("highp float2 grad_f = float3(3*k*k, -m, -l) * float2x3(grad_klmd);");
+    f->codeAppendf("%s = -clamp(0.5 - f * inversesqrt(dot(grad_f, grad_f)), 0, 1);",
+                   outputCoverage);
+    f->codeAppendf("%s -= d;", outputCoverage);
 
+    // Use software msaa to estimate actual coverage at the corner pixels.
+    const int sampleCount = this->defineSoftSampleLocations(f, "samples");
+    f->codeAppendf("highp float4 klmd_center = float4(%s.xyz, %s.w + 0.5);",
+                   fKLMD.fsIn(), fKLMD.fsIn());
     f->codeAppendf("for (int i = 0; i < %i; ++i) {", sampleCount);
-    f->codeAppendf(    "highp float4 klmd = grad_klmd * samples[i] + %s;", fKLMD.fsIn());
+    f->codeAppend (    "highp float4 klmd = grad_klmd * samples[i] + klmd_center;");
     f->codeAppend (    "lowp float f = klmd.y * klmd.z - klmd.x * klmd.x * klmd.x;");
-    // A sample is inside our cubic sub-section if it is inside the implicit AND L & M are both
-    // positive. This works because the sections get chopped at the K/L and K/M intersections.
-    f->codeAppend (    "bool4 inside = greaterThan(float4(f,klmd.yzw), float4(0));");
-    f->codeAppend (    "lowp float in_curve = all(inside.xyz) ? 1 : 0;");
-    f->codeAppend (    "lowp float in_edge = inside.w ? 1 : 0;");
-    f->codeAppendf(    "%s += use_edge ? in_edge * (in_curve - 1) : in_curve;", outputCoverage);
+    f->codeAppendf(    "%s += all(greaterThan(float4(f, klmd.y, klmd.z, klmd.w), "
+                                             "float4(0))) ? %f : 0;",
+                       outputCoverage, 1.0 / sampleCount);
     f->codeAppend ("}");
-
-    f->codeAppendf("%s *= %f;", outputCoverage, 1.0 / sampleCount);
 }
diff --git a/src/gpu/ccpr/GrCCPRCubicProcessor.h b/src/gpu/ccpr/GrCCPRCubicProcessor.h
index d445eeb315..cfee7bfac1 100644
--- a/src/gpu/ccpr/GrCCPRCubicProcessor.h
+++ b/src/gpu/ccpr/GrCCPRCubicProcessor.h
@@ -19,40 +19,29 @@ class GrGLSLGeometryBuilder;
  *
  * https://www.microsoft.com/en-us/research/wp-content/uploads/2005/01/p1000-loop.pdf
  *
- * The caller is expected to chop cubics at the KLM roots (a.k.a. inflection points and loop
- * intersection points, resulting in necessarily convex segments) before feeding them into this
- * processor. (Use GrCCPRGeometry.)
- *
- * The curves are rendered in two passes:
- *
- * Pass 1: Draw the (convex) bezier quadrilateral, inset by 1/2 pixel all around, and use the
- *         gradient-based AA technique outlined in the Loop/Blinn paper to compute coverage.
- *
- * Pass 2: Draw a border around the previous inset, up to the bezier quadrilatral's conservative
- *         raster hull, and compute coverage using pseudo MSAA. This pass is necessary because the
- *         gradient approach does not work near the L and M lines.
- *
- * FIXME: The pseudo MSAA border is slow and ugly. We should investigate an alternate solution of
- * just approximating the curve with straight lines for short distances across the problem points
- * instead.
+ * The provided curves must be convex, monotonic with respect to the vector of their closing edge
+ * [P3 - P0], and must not contain or be near any inflection points or loop intersections.
+ * (Use GrCCPRGeometry.)
  */
 class GrCCPRCubicProcessor : public GrCCPRCoverageProcessor::PrimitiveProcessor {
 public:
-    enum class Type {
+    enum class CubicType {
         kSerpentine,
         kLoop
     };
 
-    GrCCPRCubicProcessor(Type type)
+    GrCCPRCubicProcessor(CubicType cubicType)
             : INHERITED(CoverageType::kShader)
-            , fType(type)
-            , fInset(kVec3f_GrSLType)
+            , fCubicType(cubicType)
             , fKLMMatrix("klm_matrix", kMat33f_GrSLType, GrShaderVar::kNonArray,
                          kHigh_GrSLPrecision)
-            , fKLMDerivatives("klm_derivatives", kVec2f_GrSLType, 3, kHigh_GrSLPrecision) {}
+            , fKLMDerivatives("klm_derivatives", kVec2f_GrSLType, 3, kHigh_GrSLPrecision)
+            , fEdgeDistanceEquation("edge_distance_equation", kVec3f_GrSLType,
+                                    GrShaderVar::kNonArray, kHigh_GrSLPrecision)
+            , fKLMD(kVec4f_GrSLType) {}
 
     void resetVaryings(GrGLSLVaryingHandler* varyingHandler) override {
-        varyingHandler->addVarying("insets", &fInset, kHigh_GrSLPrecision);
+        varyingHandler->addVarying("klmd", &fKLMD, kHigh_GrSLPrecision);
     }
 
     void onEmitVertexShader(const GrCCPRCoverageProcessor&, GrGLSLVertexBuilder*,
@@ -61,82 +50,69 @@ public:
     void emitWind(GrGLSLGeometryBuilder*, const char* rtAdjust, const char* outputWind) const final;
     void onEmitGeometryShader(GrGLSLGeometryBuilder*, const char* emitVertexFn, const char* wind,
                               const char* rtAdjust) const final;
+    void emitPerVertexGeometryCode(SkString* fnBody, const char* position, const char* coverage,
+                                   const char* wind) const final;
 
 protected:
     virtual void emitCubicGeometry(GrGLSLGeometryBuilder*, const char* emitVertexFn,
                                    const char* wind, const char* rtAdjust) const = 0;
+    virtual void onEmitPerVertexGeometryCode(SkString* fnBody) const = 0;
 
-    const Type        fType;
-    GrGLSLVertToGeo   fInset;
+    const CubicType   fCubicType;
     GrShaderVar       fKLMMatrix;
     GrShaderVar       fKLMDerivatives;
+    GrShaderVar       fEdgeDistanceEquation;
+    GrGLSLGeoToFrag   fKLMD;
 
     typedef GrCCPRCoverageProcessor::PrimitiveProcessor INHERITED;
 };
 
-class GrCCPRCubicInsetProcessor : public GrCCPRCubicProcessor {
+class GrCCPRCubicHullProcessor : public GrCCPRCubicProcessor {
 public:
-    GrCCPRCubicInsetProcessor(Type type)
-            : INHERITED(type)
-            , fKLM(kVec3f_GrSLType)
+    GrCCPRCubicHullProcessor(CubicType cubicType)
+            : INHERITED(cubicType)
             , fGradMatrix(kMat22f_GrSLType) {}
 
     void resetVaryings(GrGLSLVaryingHandler* varyingHandler) override {
         this->INHERITED::resetVaryings(varyingHandler);
-        varyingHandler->addVarying("klm", &fKLM, kHigh_GrSLPrecision);
         varyingHandler->addVarying("grad_matrix", &fGradMatrix, kHigh_GrSLPrecision);
     }
 
     void emitCubicGeometry(GrGLSLGeometryBuilder*, const char* emitVertexFn,
                            const char* wind, const char* rtAdjust) const override;
-    void emitPerVertexGeometryCode(SkString* fnBody, const char* position, const char* coverage,
-                                   const char* wind) const override;
+    void onEmitPerVertexGeometryCode(SkString* fnBody) const override;
     void emitShaderCoverage(GrGLSLFragmentBuilder*, const char* outputCoverage) const override;
 
 protected:
-    GrGLSLGeoToFrag   fKLM;
     GrGLSLGeoToFrag   fGradMatrix;
 
     typedef GrCCPRCubicProcessor INHERITED;
 };
 
-class GrCCPRCubicBorderProcessor : public GrCCPRCubicProcessor {
+class GrCCPRCubicCornerProcessor : public GrCCPRCubicProcessor {
 public:
-    GrCCPRCubicBorderProcessor(Type type)
-            : INHERITED(type)
-            , fEdgeDistanceEquation("edge_distance_equation", kVec3f_GrSLType,
-                                    GrShaderVar::kNonArray, kHigh_GrSLPrecision)
+    GrCCPRCubicCornerProcessor(CubicType cubicType)
+            : INHERITED(cubicType)
             , fEdgeDistanceDerivatives("edge_distance_derivatives", kVec2f_GrSLType,
                                         GrShaderVar::kNonArray, kHigh_GrSLPrecision)
-            , fEdgeSpaceTransform("edge_space_transform", kVec4f_GrSLType, GrShaderVar::kNonArray,
-                                  kHigh_GrSLPrecision)
-            , fKLMD(kVec4f_GrSLType)
             , fdKLMDdx(kVec4f_GrSLType)
-            , fdKLMDdy(kVec4f_GrSLType)
-            , fEdgeSpaceCoord(kVec2f_GrSLType) {}
+            , fdKLMDdy(kVec4f_GrSLType) {}
 
     void resetVaryings(GrGLSLVaryingHandler* varyingHandler) override {
         this->INHERITED::resetVaryings(varyingHandler);
-        varyingHandler->addVarying("klmd", &fKLMD, kHigh_GrSLPrecision);
         varyingHandler->addFlatVarying("dklmddx", &fdKLMDdx, kHigh_GrSLPrecision);
         varyingHandler->addFlatVarying("dklmddy", &fdKLMDdy, kHigh_GrSLPrecision);
-        varyingHandler->addVarying("edge_space_coord", &fEdgeSpaceCoord, kHigh_GrSLPrecision);
     }
 
     void emitCubicGeometry(GrGLSLGeometryBuilder*, const char* emitVertexFn,
                            const char* wind, const char* rtAdjust) const override;
-    void emitPerVertexGeometryCode(SkString* fnBody, const char* position, const char* coverage,
-                                   const char* wind) const override;
+    void onEmitPerVertexGeometryCode(SkString* fnBody) const override;
     void emitShaderCoverage(GrGLSLFragmentBuilder*, const char* outputCoverage) const override;
 
 protected:
-    GrShaderVar        fEdgeDistanceEquation;
     GrShaderVar        fEdgeDistanceDerivatives;
-    GrShaderVar        fEdgeSpaceTransform;
-    GrGLSLGeoToFrag    fKLMD;
     GrGLSLGeoToFrag    fdKLMDdx;
     GrGLSLGeoToFrag    fdKLMDdy;
-    GrGLSLGeoToFrag    fEdgeSpaceCoord;
 
     typedef GrCCPRCubicProcessor INHERITED;
 };
diff --git a/src/gpu/ccpr/GrCCPRGeometry.cpp b/src/gpu/ccpr/GrCCPRGeometry.cpp
index a2c08908bf..4ba4f54c63 100644
--- a/src/gpu/ccpr/GrCCPRGeometry.cpp
+++ b/src/gpu/ccpr/GrCCPRGeometry.cpp
@@ -8,9 +8,7 @@
 #include "GrCCPRGeometry.h"
 
 #include "GrTypes.h"
-#include "SkGeometry.h"
-#include "SkPoint.h"
-#include "../pathops/SkPathOpsCubic.h"
+#include "GrPathUtils.h"
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
@@ -126,84 +124,403 @@ inline void GrCCPRGeometry::appendMonotonicQuadratic(const Sk2f& p1, const Sk2f&
     ++fCurrContourTallies.fQuadratics;
 }
 
-void GrCCPRGeometry::cubicTo(const SkPoint& devP1, const SkPoint& devP2, const SkPoint& devP3) {
+using ExcludedTerm = GrPathUtils::ExcludedTerm;
+
+// Calculates the padding to apply around inflection points, in homogeneous parametric coordinates.
+//
+// More specifically, if the inflection point lies at C(t/s), then C((t +/- returnValue) / s) will
+// be the two points on the curve at which a square box with radius "padRadius" will have a corner
+// that touches the inflection point's tangent line.
+//
+// A serpentine cubic has two inflection points, so this method takes Sk2f and computes the padding
+// for both in SIMD.
+static inline Sk2f calc_inflect_homogeneous_padding(float padRadius, const Sk2f& t, const Sk2f& s,
+                                                    const SkMatrix& CIT, ExcludedTerm skipTerm) {
+    SkASSERT(padRadius >= 0);
+
+    Sk2f Clx = s*s*s;
+    Sk2f Cly = (ExcludedTerm::kLinearTerm == skipTerm) ? s*s*t*-3 : s*t*t*3;
+
+    Sk2f Lx = CIT[0] * Clx + CIT[3] * Cly;
+    Sk2f Ly = CIT[1] * Clx + CIT[4] * Cly;
+
+    float ret[2];
+    Sk2f bloat = padRadius * (Lx.abs() + Ly.abs());
+    (bloat * s >= 0).thenElse(bloat, -bloat).store(ret);
+
+    ret[0] = cbrtf(ret[0]);
+    ret[1] = cbrtf(ret[1]);
+    return Sk2f::Load(ret);
+}
+
+static inline void swap_if_greater(float& a, float& b) {  // Swaps in place when a > b.
+    if (a > b) {
+        std::swap(a, b);
+    }
+}
+
+// Calculates all parameter values for a loop at which points a square box with radius "padRadius"
+// will have a corner that touches a tangent line from the intersection.
+//
+// T2 must contain the lesser parameter value of the loop intersection in its first component, and
+// the greater in its second.
+//
+// roots[0] will be filled with 1 or 3 sorted parameter values, representing the padding points
+// around the first tangent. roots[1] will be filled with the padding points for the second tangent.
+static inline void calc_loop_intersect_padding_pts(float padRadius, const Sk2f& T2,
+                                                  const SkMatrix& CIT, ExcludedTerm skipTerm,
+                                                  SkSTArray<3, float, true> roots[2]) {
+    SkASSERT(padRadius >= 0);
+    SkASSERT(T2[0] <= T2[1]);
+    SkASSERT(roots[0].empty());
+    SkASSERT(roots[1].empty());
+
+    Sk2f T1 = SkNx_shuffle<1,0>(T2);
+    Sk2f Cl = (ExcludedTerm::kLinearTerm == skipTerm) ? T2*-2 - T1 : T2*T2 + T2*T1*2;
+    Sk2f Lx = Cl * CIT[3] + CIT[0];
+    Sk2f Ly = Cl * CIT[4] + CIT[1];
+
+    Sk2f bloat = Sk2f(+.5f * padRadius, -.5f * padRadius) * (Lx.abs() + Ly.abs());
+    Sk2f q = (1.f/3) * (T2 - T1);
+
+    Sk2f qqq = q*q*q;
+    Sk2f discr = qqq*bloat*2 + bloat*bloat;
+
+    float numRoots[2], D[2];
+    (discr < 0).thenElse(3, 1).store(numRoots);
+    (T2 - q).store(D);
+
+    // Values for calculating one root.
+    float R[2], QQ[2];
+    if ((discr >= 0).anyTrue()) {
+        Sk2f r = qqq + bloat;
+        Sk2f s = r.abs() + discr.sqrt();
+        (r > 0).thenElse(-s, s).store(R);
+        (q*q).store(QQ);
+    }
+
+    // Values for calculating three roots.
+    float P[2], cosTheta3[2];
+    if ((discr < 0).anyTrue()) {
+        (q.abs() * -2).store(P);
+        ((q >= 0).thenElse(1, -1) + bloat / qqq.abs()).store(cosTheta3);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+        if (1 == numRoots[i]) {
+            float A = cbrtf(R[i]);
+            float B = A != 0 ? QQ[i]/A : 0;
+            roots[i].push_back(A + B + D[i]);
+            continue;
+        }
+
+        static constexpr float k2PiOver3 = 2 * SK_ScalarPI / 3;
+        float theta = std::acos(cosTheta3[i]) * (1.f/3);
+        roots[i].push_back(P[i] * std::cos(theta) + D[i]);
+        roots[i].push_back(P[i] * std::cos(theta + k2PiOver3) + D[i]);
+        roots[i].push_back(P[i] * std::cos(theta - k2PiOver3) + D[i]);
+
+        // Sort the three roots.
+        swap_if_greater(roots[i][0], roots[i][1]);
+        swap_if_greater(roots[i][1], roots[i][2]);
+        swap_if_greater(roots[i][0], roots[i][1]);
+    }
+}
+
+void GrCCPRGeometry::cubicTo(const SkPoint& devP1, const SkPoint& devP2, const SkPoint& devP3,
+                             float inflectPad, float loopIntersectPad) {
     SkASSERT(fBuildingContour);
 
-    SkPoint P[4] = {fCurrFanPoint, devP1, devP2, devP3};
-    double t[2], s[2];
-    SkCubicType type = SkClassifyCubic(P, t, s);
+    SkPoint devPts[4] = {fCurrFanPoint, devP1, devP2, devP3};
+    Sk2f p0 = Sk2f::Load(&fCurrFanPoint);
+    Sk2f p1 = Sk2f::Load(&devP1);
+    Sk2f p2 = Sk2f::Load(&devP2);
+    Sk2f p3 = Sk2f::Load(&devP3);
+    fCurrFanPoint = devP3;
 
-    if (SkCubicType::kLineOrPoint == type) {
-        this->lineTo(P[3]);
+    double tt[2], ss[2];
+    fCurrCubicType = SkClassifyCubic(devPts, tt, ss);
+    if (SkCubicIsDegenerate(fCurrCubicType)) {
+        // Allow one subdivision in case the curve is quadratic, but not monotonic.
+        this->appendCubicApproximation(p0, p1, p2, p3, /*maxSubdivisions=*/1);
         return;
     }
 
-    if (SkCubicType::kQuadratic == type) {
-        SkPoint quadP1 = (devP1 + devP2) * .75f - (fCurrFanPoint + devP3) * .25f;
-        this->quadraticTo(quadP1, devP3);
+    SkMatrix CIT;
+    ExcludedTerm skipTerm = GrPathUtils::calcCubicInverseTransposePowerBasisMatrix(devPts, &CIT);
+    if (ExcludedTerm::kNonInvertible == skipTerm) {
+        // This could technically also happen if the curve were a quadratic, but SkClassifyCubic
+        // should have detected that case already with tolerance.
+        fCurrCubicType = SkCubicType::kLineOrPoint;
+        this->appendCubicApproximation(p0, p1, p2, p3, /*maxSubdivisions=*/0);
         return;
     }
+    SkASSERT(0 == CIT[6]);
+    SkASSERT(0 == CIT[7]);
+    SkASSERT(1 == CIT[8]);
 
-    fCurrFanPoint = devP3;
+    // Each cubic has five different sections (not always inside t=[0..1]):
+    //
+    //   1. The section before the first inflection or loop intersection point, with padding.
+    //   2. The section that passes through the first inflection/intersection (aka the K,L
+    //      intersection point or T=tt[0]/ss[0]).
+    //   3. The section between the two inflections/intersections, with padding.
+    //   4. The section that passes through the second inflection/intersection (aka the K,M
+    //      intersection point or T=tt[1]/ss[1]).
+    //   5. The section after the second inflection/intersection, with padding.
+    //
+    // Sections 1,3,5 can be rendered directly using the CCPR cubic shader.
+    //
+    // Sections 2 & 4 must be approximated. For loop intersections we render them with
+    // quadratic(s), and when passing through an inflection point we use a plain old flat line.
+    //
+    // We find T0..T3 below to be the dividing points between these five sections.
+    float T0, T1, T2, T3;
+    if (SkCubicType::kLoop != fCurrCubicType) {
+        Sk2f t = Sk2f(static_cast<float>(tt[0]), static_cast<float>(tt[1]));
+        Sk2f s = Sk2f(static_cast<float>(ss[0]), static_cast<float>(ss[1]));
+        Sk2f pad = calc_inflect_homogeneous_padding(inflectPad, t, s, CIT, skipTerm);
+
+        float T[2];
+        ((t - pad) / s).store(T);
+        T0 = T[0];
+        T2 = T[1];
+
+        ((t + pad) / s).store(T);
+        T1 = T[0];
+        T3 = T[1];
+    } else {
+        const float T[2] = {static_cast<float>(tt[0]/ss[0]), static_cast<float>(tt[1]/ss[1])};
+        SkSTArray<3, float, true> roots[2];
+        calc_loop_intersect_padding_pts(loopIntersectPad, Sk2f::Load(T), CIT, skipTerm, roots);
+        T0 = roots[0].front();
+        if (1 == roots[0].count() || 1 == roots[1].count()) {
+            // The loop is tighter than our desired padding. Collapse the middle section to a point
+            // somewhere in the middle-ish of the loop and Sections 2 & 4 will approximate the
+            // whole thing with quadratics.
+            T1 = T2 = (T[0] + T[1]) * .5f;
+        } else {
+            T1 = roots[0][1];
+            T2 = roots[1][1];
+        }
+        T3 = roots[1].back();
+    }
 
-    SkDCubic C;
-    C.set(P);
+    // Guarantee that T0..T3 are monotonic.
+    if (T0 > T3) {
+        // This is not a mathematically valid scenario. The only reason it would happen is if
+        // padding is very small and we have encountered FP rounding error.
+        T0 = T1 = T2 = T3 = (T0 + T3) / 2;
+    } else if (T1 > T2) {
+        // This just means padding before the middle section overlaps the padding after it. We
+        // collapse the middle section to a single point that splits the difference between the
+        // overlap in padding.
+        T1 = T2 = (T1 + T2) / 2;
+    }
+    // Clamp T1 & T2 inside T0..T3. The only reason this would be necessary is if we have
+    // encountered FP rounding error.
+    T1 = std::max(T0, std::min(T1, T3));
+    T2 = std::max(T0, std::min(T2, T3));
+
+    // Next we chop the cubic up at all T0..T3 inside 0..1 and store the resulting segments.
+    if (T1 >= 1) {
+        // Only sections 1 & 2 can be in 0..1.
+        this->chopCubic<&GrCCPRGeometry::appendMonotonicCubics,
+                        &GrCCPRGeometry::appendCubicApproximation>(p0, p1, p2, p3, T0);
+        return;
+    }
 
-    for (int x = 0; x <= 1; ++x) {
-        if (t[x] * s[x] <= 0) { // This is equivalent to tx/sx <= 0.
-            // This technically also gets taken if tx/sx = infinity, but the code still does
-            // the right thing in that edge case.
-            continue; // Don't increment x0.
-        }
-        if (fabs(t[x]) >= fabs(s[x])) { // tx/sx >= 1.
-            break;
-        }
+    if (T2 <= 0) {
+        // Only sections 4 & 5 can be in 0..1.
+        this->chopCubic<&GrCCPRGeometry::appendCubicApproximation,
+                        &GrCCPRGeometry::appendMonotonicCubics>(p0, p1, p2, p3, T3);
+        return;
+    }
 
-        const double chopT = double(t[x]) / double(s[x]);
-        SkASSERT(chopT >= 0 && chopT <= 1);
-        if (chopT <= 0 || chopT >= 1) { // floating-point error.
-            continue;
+    Sk2f midp0, midp1; // These hold the first two bezier points of the middle section, if needed.
+
+    if (T1 > 0) {
+        Sk2f T1T1 = Sk2f(T1);
+        Sk2f ab1 = lerp(p0, p1, T1T1);
+        Sk2f bc1 = lerp(p1, p2, T1T1);
+        Sk2f cd1 = lerp(p2, p3, T1T1);
+        Sk2f abc1 = lerp(ab1, bc1, T1T1);
+        Sk2f bcd1 = lerp(bc1, cd1, T1T1);
+        Sk2f abcd1 = lerp(abc1, bcd1, T1T1);
+
+        // Sections 1 & 2.
+        this->chopCubic<&GrCCPRGeometry::appendMonotonicCubics,
+                        &GrCCPRGeometry::appendCubicApproximation>(p0, ab1, abc1, abcd1, T0/T1);
+
+        if (T2 >= 1) {
+            // The rest of the curve is Section 3 (middle section).
+            this->appendMonotonicCubics(abcd1, bcd1, cd1, p3);
+            return;
         }
 
-        SkDCubicPair chopped = C.chopAt(chopT);
+        // Now calculate the first two bezier points of the middle section. The final two will come
+        // from when we chop the other side, as that is numerically more stable.
+        midp0 = abcd1;
+        midp1 = lerp(abcd1, bcd1, Sk2f((T2 - T1) / (1 - T1)));
+    } else if (T2 >= 1) {
+        // The entire cubic is Section 3 (middle section).
+        this->appendMonotonicCubics(p0, p1, p2, p3);
+        return;
+    }
 
-        // Ensure the double points are identical if this is a loop (more workarounds for FP error).
-        if (SkCubicType::kLoop == type && 0 == t[0]) {
-            chopped.pts[3] = chopped.pts[0];
-        }
+    SkASSERT(T2 > 0 && T2 < 1);
+
+    Sk2f T2T2 = Sk2f(T2);
+    Sk2f ab2 = lerp(p0, p1, T2T2);
+    Sk2f bc2 = lerp(p1, p2, T2T2);
+    Sk2f cd2 = lerp(p2, p3, T2T2);
+    Sk2f abc2 = lerp(ab2, bc2, T2T2);
+    Sk2f bcd2 = lerp(bc2, cd2, T2T2);
+    Sk2f abcd2 = lerp(abc2, bcd2, T2T2);
+
+    if (T1 <= 0) {
+        // The curve begins at Section 3 (middle section).
+        this->appendMonotonicCubics(p0, ab2, abc2, abcd2);
+    } else if (T2 > T1) {
+        // Section 3 (middle section).
+        Sk2f midp2 = lerp(abc2, abcd2, T1/T2);
+        this->appendMonotonicCubics(midp0, midp1, midp2, abcd2);
+    }
+
+    // Sections 4 & 5.
+    this->chopCubic<&GrCCPRGeometry::appendCubicApproximation,
+                    &GrCCPRGeometry::appendMonotonicCubics>(abcd2, bcd2, cd2, p3, (T3-T2) / (1-T2));
+}
 
-        // (This might put ts0/ts1 out of order, but it doesn't matter anymore at this point.)
-        this->appendConvexCubic(type, chopped.first());
-        t[x] = 0;
-        s[x] = 1;
+static inline Sk2f first_unless_nearly_zero(const Sk2f& a, const Sk2f& b) {
+    Sk2f aa = a*a;
+    aa += SkNx_shuffle<1,0>(aa);  // Both lanes now hold a.x^2 + a.y^2 (asserted below).
+    SkASSERT(aa[0] == aa[1]);
 
-        const double r = s[1 - x] * chopT;
-        t[1 - x] -= r;
-        s[1 - x] -= r;
+    Sk2f bb = b*b;
+    bb += SkNx_shuffle<1,0>(bb);  // Likewise, both lanes hold b.x^2 + b.y^2.
+    SkASSERT(bb[0] == bb[1]);
+    // Return a, unless its squared length is at most SK_ScalarNearlyZero times that of b.
+    return (aa > bb * SK_ScalarNearlyZero).thenElse(a, b);
+}
 
-        C = chopped.second();
+template<GrCCPRGeometry::AppendCubicFn AppendLeftRight>
+inline void GrCCPRGeometry::chopCubicAtMidTangent(const Sk2f& p0, const Sk2f& p1, const Sk2f& p2,
+                                                  const Sk2f& p3, const Sk2f& tan0,
+                                                  const Sk2f& tan3, int maxFutureSubdivisions) {
+    // Find the T value whose tangent is perpendicular to the vector that bisects tan0 and -tan3.
+    Sk2f n = normalize(tan0) - normalize(tan3);
+
+    float a = 3 * dot(p3 + (p1 - p2)*3 - p0, n);
+    float b = 6 * dot(p0 - p1*2 + p2, n);
+    float c = 3 * dot(p1 - p0, n);
+
+    float discr = b*b - 4*a*c;
+    if (discr < 0) {
+        // If this is the case then the cubic must be nearly flat.
+        (this->*AppendLeftRight)(p0, p1, p2, p3, maxFutureSubdivisions);
+        return;
     }
 
-    this->appendConvexCubic(type, C);
+    float q = -.5f * (b + copysignf(std::sqrt(discr), b));
+    float m = .5f*q*a;
+    float T = std::abs(q*q - m) < std::abs(a*c - m) ? q/a : c/q;
+
+    this->chopCubic<AppendLeftRight, AppendLeftRight>(p0, p1, p2, p3, T, maxFutureSubdivisions);
 }
 
-static SkPoint to_skpoint(const SkDPoint& dpoint) {
-    return {static_cast<SkScalar>(dpoint.fX), static_cast<SkScalar>(dpoint.fY)};
+template<GrCCPRGeometry::AppendCubicFn AppendLeft, GrCCPRGeometry::AppendCubicFn AppendRight>
+inline void GrCCPRGeometry::chopCubic(const Sk2f& p0, const Sk2f& p1, const Sk2f& p2,
+                                      const Sk2f& p3, float T, int maxFutureSubdivisions) {
+    if (T >= 1) {  // Nothing lies to the right of T: hand the whole curve to AppendLeft.
+        (this->*AppendLeft)(p0, p1, p2, p3, maxFutureSubdivisions);
+        return;
+    }
+
+    if (T <= 0) {  // Nothing lies to the left of T: hand the whole curve to AppendRight.
+        (this->*AppendRight)(p0, p1, p2, p3, maxFutureSubdivisions);
+        return;
+    }
+    // Subdivide at T by repeated lerps; abcd ends the left piece and begins the right piece.
+    Sk2f TT = T;
+    Sk2f ab = lerp(p0, p1, TT);
+    Sk2f bc = lerp(p1, p2, TT);
+    Sk2f cd = lerp(p2, p3, TT);
+    Sk2f abc = lerp(ab, bc, TT);
+    Sk2f bcd = lerp(bc, cd, TT);
+    Sk2f abcd = lerp(abc, bcd, TT);
+    (this->*AppendLeft)(p0, ab, abc, abcd, maxFutureSubdivisions);
+    (this->*AppendRight)(abcd, bcd, cd, p3, maxFutureSubdivisions);
 }
 
-inline void GrCCPRGeometry::appendConvexCubic(SkCubicType type, const SkDCubic& C) {
-    fPoints.push_back(to_skpoint(C[1]));
-    fPoints.push_back(to_skpoint(C[2]));
-    fPoints.push_back(to_skpoint(C[3]));
-    if (SkCubicType::kLoop != type) {
-        fVerbs.push_back(Verb::kConvexSerpentineTo);
+void GrCCPRGeometry::appendMonotonicCubics(const Sk2f& p0, const Sk2f& p1, const Sk2f& p2,
+                                           const Sk2f& p3, int maxSubdivisions) {
+    if ((p0 == p3).allTrue()) {
+        return;
+    }
+
+    if (maxSubdivisions) {
+        Sk2f tan0 = first_unless_nearly_zero(p1 - p0, p2 - p0);
+        Sk2f tan3 = first_unless_nearly_zero(p3 - p2, p3 - p1);
+
+        if (!is_convex_curve_monotonic(p0, tan0, p3, tan3)) {
+            this->chopCubicAtMidTangent<&GrCCPRGeometry::appendMonotonicCubics>(p0, p1, p2, p3,
+                                                                                tan0, tan3,
+                                                                                maxSubdivisions-1);
+            return;
+        }
+    }
+
+    SkASSERT(fPoints.back() == SkPoint::Make(p0[0], p0[1]));
+    p1.store(&fPoints.push_back());
+    p2.store(&fPoints.push_back());
+    p3.store(&fPoints.push_back());
+    if (SkCubicType::kLoop != fCurrCubicType) {
+        fVerbs.push_back(Verb::kMonotonicSerpentineTo);
         ++fCurrContourTallies.fSerpentines;
     } else {
-        fVerbs.push_back(Verb::kConvexLoopTo);
+        fVerbs.push_back(Verb::kMonotonicLoopTo);
         ++fCurrContourTallies.fLoops;
     }
 }
 
+void GrCCPRGeometry::appendCubicApproximation(const Sk2f& p0, const Sk2f& p1, const Sk2f& p2,
+                                              const Sk2f& p3, int maxSubdivisions) {
+    if ((p0 == p3).allTrue()) {
+        return;
+    }
+
+    if (SkCubicType::kLoop != fCurrCubicType && SkCubicType::kQuadratic != fCurrCubicType) {
+        // This section passes through an inflection point, so we can get away with a flat line.
+        // This can cause some curves to feel slightly more flat when inspected rigorously back and
+        // forth against another renderer, but for now this seems acceptable given the simplicity.
+        SkASSERT(fPoints.back() == SkPoint::Make(p0[0], p0[1]));
+        p3.store(&fPoints.push_back());
+        fVerbs.push_back(Verb::kLineTo);
+        return;
+    }
+
+    Sk2f tan0 = first_unless_nearly_zero(p1 - p0, p2 - p0);
+    Sk2f tan3 = first_unless_nearly_zero(p3 - p2, p3 - p1);
+
+    Sk2f c1 = SkNx_fma(Sk2f(1.5f), tan0, p0);
+    Sk2f c2 = SkNx_fma(Sk2f(-1.5f), tan3, p3);
+
+    if (maxSubdivisions) {
+        bool nearlyQuadratic = ((c1 - c2).abs() <= 1).allTrue();
+
+        if (!nearlyQuadratic || !is_convex_curve_monotonic(p0, tan0, p3, tan3)) {
+            this->chopCubicAtMidTangent<&GrCCPRGeometry::appendCubicApproximation>(p0, p1, p2, p3,
+                                                                                   tan0, tan3,
+                                                                                 maxSubdivisions-1);
+            return;
+        }
+    }
+
+    SkASSERT(fPoints.back() == SkPoint::Make(p0[0], p0[1]));
+    this->appendMonotonicQuadratic((c1 + c2) * .5f, p3);
+}
+
 GrCCPRGeometry::PrimitiveTallies GrCCPRGeometry::endContour() {
     SkASSERT(fBuildingContour);
     SkASSERT(fVerbs.count() >= fCurrContourTallies.fTriangles);
diff --git a/src/gpu/ccpr/GrCCPRGeometry.h b/src/gpu/ccpr/GrCCPRGeometry.h
index 72b84d5a77..ee06f78a9a 100644
--- a/src/gpu/ccpr/GrCCPRGeometry.h
+++ b/src/gpu/ccpr/GrCCPRGeometry.h
@@ -8,13 +8,11 @@
 #ifndef GrGrCCPRGeometry_DEFINED
 #define GrGrCCPRGeometry_DEFINED
 
+#include "SkGeometry.h"
 #include "SkNx.h"
 #include "SkPoint.h"
 #include "SkTArray.h"
 
-struct SkDCubic;
-enum class SkCubicType;
-
 /**
  * This class chops device-space contours up into a series of segments that CCPR knows how to
  * render. (See GrCCPRGeometry::Verb.)
@@ -32,8 +30,8 @@ public:
         kBeginContour,
         kLineTo,
         kMonotonicQuadraticTo, // Monotonic relative to the vector between its endpoints [P2 - P0].
-        kConvexSerpentineTo,
-        kConvexLoopTo,
+        kMonotonicSerpentineTo,
+        kMonotonicLoopTo,
         kEndClosedContour, // endPt == startPt.
         kEndOpenContour // endPt != startPt.
     };
@@ -77,17 +75,50 @@ public:
     void beginContour(const SkPoint& devPt);
     void lineTo(const SkPoint& devPt);
     void quadraticTo(const SkPoint& devP1, const SkPoint& devP2);
-    void cubicTo(const SkPoint& devP1, const SkPoint& devP2, const SkPoint& devP3);
+
+    // We pass through inflection points and loop intersections using a line and quadratic(s)
+    // respectively. 'inflectPad' and 'loopIntersectPad' specify how close (in pixels) cubic
+    // segments are allowed to get to these points. For normal rendering you will want to use the
+    // default values, but these can be overridden for testing purposes.
+    //
+    // NOTE: loops do appear to require two full pixels of padding around the intersection point.
+    //       With just one pixel-width of pad, we start to see bad pixels. Ultimately this has a
+    //       minimal effect on the total amount of segments produced. Most sections that pass
+    //       through the loop intersection can be approximated with a single quadratic anyway,
+    //       regardless of whether we use one pixel of pad or two (1.622 avg. quads per loop
+    //       intersection vs. 1.489 on the tiger).
+    void cubicTo(const SkPoint& devP1, const SkPoint& devP2, const SkPoint& devP3,
+                 float inflectPad = 0.55f, float loopIntersectPad = 2);
+
     PrimitiveTallies endContour(); // Returns the numbers of primitives needed to draw the contour.
 
 private:
     inline void appendMonotonicQuadratic(const Sk2f& p1, const Sk2f& p2);
-    inline void appendConvexCubic(SkCubicType, const SkDCubic&);
+
+    using AppendCubicFn = void(GrCCPRGeometry::*)(const Sk2f& p0, const Sk2f& p1,
+                                                  const Sk2f& p2, const Sk2f& p3,
+                                                  int maxSubdivisions);
+    static constexpr int kMaxSubdivionsPerCubicSection = 2;
+
+    template<AppendCubicFn AppendLeftRight>
+    inline void chopCubicAtMidTangent(const Sk2f& p0, const Sk2f& p1, const Sk2f& p2,
+                                      const Sk2f& p3, const Sk2f& tan0, const Sk2f& tan3,
+                                      int maxFutureSubdivisions = kMaxSubdivionsPerCubicSection);
+
+    template<AppendCubicFn AppendLeft, AppendCubicFn AppendRight>
+    inline void chopCubic(const Sk2f& p0, const Sk2f& p1, const Sk2f& p2, const Sk2f& p3,
+                          float T, int maxFutureSubdivisions = kMaxSubdivionsPerCubicSection);
+
+    void appendMonotonicCubics(const Sk2f& p0, const Sk2f& p1, const Sk2f& p2, const Sk2f& p3,
+                               int maxSubdivisions = kMaxSubdivionsPerCubicSection);
+    void appendCubicApproximation(const Sk2f& p0, const Sk2f& p1, const Sk2f& p2, const Sk2f& p3,
+                                  int maxSubdivisions = kMaxSubdivionsPerCubicSection);
 
     // Transient state used while building a contour.
     SkPoint                         fCurrAnchorPoint;
     SkPoint                         fCurrFanPoint;
     PrimitiveTallies                fCurrContourTallies;
+    SkCubicType                     fCurrCubicType;
     SkDEBUGCODE(bool                fBuildingContour = false);
 
     // TODO: These points could eventually be written directly to block-allocated GPU buffers.