8 files changed, 2714 insertions, 5 deletions
diff --git a/src/core/SkLinearBitmapPipeline.cpp b/src/core/SkLinearBitmapPipeline.cpp
new file mode 100644
index 0000000000..cf2dfdc09f
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline.cpp
@@ -0,0 +1,686 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkLinearBitmapPipeline.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <tuple>
+
+#include "SkArenaAlloc.h"
+#include "SkLinearBitmapPipeline_core.h"
+#include "SkLinearBitmapPipeline_matrix.h"
+#include "SkLinearBitmapPipeline_tile.h"
+#include "SkLinearBitmapPipeline_sample.h"
+#include "SkNx.h"
+#include "SkOpts.h"
+#include "SkPM4f.h"
+
+namespace  {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Matrix Stage
+// PointProcessor uses a strategy to help complete the work of the different stages. The strategy
+// must implement the following methods:
+// * processPoints(xs, ys) - must mutate the xs and ys for the stage.
+// * maybeProcessSpan(span, next) - This represents a horizontal series of pixels
+//   to work over.
+//   span - encapsulation of span.
+//   next - a pointer to the next stage.
+//   maybeProcessSpan - returns false if it can not process the span and needs to fallback to
+//                      point lists for processing.
+template<typename Strategy, typename Next>
+class MatrixStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
+public:
+    template <typename... Args>
+    MatrixStage(Next* next, Args&&... args)  // args are forwarded to Strategy's constructor
+        : fNext{next}
+        , fStrategy{std::forward<Args>(args)...}{ }
+
+    MatrixStage(Next* next, MatrixStage* stage)  // clone: copies stage's strategy, new next
+        : fNext{next}
+        , fStrategy{stage->fStrategy} { }
+
+    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        fStrategy.processPoints(&xs, &ys);  // strategy rewrites the local xs/ys copies
+        fNext->pointListFew(n, xs, ys);     // n is passed through untouched
+    }
+
+    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        fStrategy.processPoints(&xs, &ys);  // same mapping as pointListFew, all four lanes
+        fNext->pointList4(xs, ys);
+    }
+
+    // The span you pass must not be empty (asserted below).
+    void pointSpan(Span span) override {
+        SkASSERT(!span.isEmpty());
+        if (!fStrategy.maybeProcessSpan(span, fNext)) {  // false: strategy declined the span
+            span_fallback(span, this);  // hand the span back to this stage via span_fallback
+        }
+    }
+
+private:
+    Next* const fNext;   // downstream stage; not owned by this object
+    Strategy fStrategy;  // held by value; copied when the stage is cloned
+};
+
+template <typename Next = SkLinearBitmapPipeline::PointProcessorInterface>
+using TranslateMatrix = MatrixStage<TranslateMatrixStrategy, Next>;  // built from a translate
+
+template <typename Next = SkLinearBitmapPipeline::PointProcessorInterface>
+using ScaleMatrix = MatrixStage<ScaleMatrixStrategy, Next>;  // built from translate + scale
+
+template <typename Next = SkLinearBitmapPipeline::PointProcessorInterface>
+using AffineMatrix = MatrixStage<AffineMatrixStrategy, Next>;  // translate + scale + skew
+
+template <typename Next = SkLinearBitmapPipeline::PointProcessorInterface>
+using PerspectiveMatrix = MatrixStage<PerspectiveMatrixStrategy, Next>;  // adds persp terms
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Tile Stage
+
+template<typename XStrategy, typename YStrategy, typename Next>
+class CombinedTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
+public:
+    CombinedTileStage(Next* next, SkISize dimensions)  // X gets the width, Y gets the height
+        : fNext{next}
+        , fXStrategy{dimensions.width()}
+        , fYStrategy{dimensions.height()}{ }
+
+    CombinedTileStage(Next* next, CombinedTileStage* stage)  // clone: copy both strategies
+        : fNext{next}
+        , fXStrategy{stage->fXStrategy}
+        , fYStrategy{stage->fYStrategy} { }
+
+    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        fXStrategy.tileXPoints(&xs);  // X and Y are tiled independently
+        fYStrategy.tileYPoints(&ys);
+        fNext->pointListFew(n, xs, ys);
+    }
+
+    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        fXStrategy.tileXPoints(&xs);  // same tiling as pointListFew, all four lanes
+        fYStrategy.tileYPoints(&ys);
+        fNext->pointList4(xs, ys);
+    }
+
+    // The span you pass must not be empty (asserted below).
+    void pointSpan(Span span) override {
+        SkASSERT(!span.isEmpty());
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;  // unpack the span into its three fields
+
+        if (span.count() == 1) {  // a single pixel goes through the point path instead
+            // DANGER:
+            // The explicit casts from float to Sk4f are not usually necessary, but are here to
+            // work around an MSVC 2015u2 c++ code generation bug. This is tracked using skia bug
+            // 5566.
+            this->pointListFew(1, Sk4f{span.startX()}, Sk4f{span.startY()});
+            return;
+        }
+
+        SkScalar x = X(start);
+        SkScalar y = fYStrategy.tileY(Y(start));  // Y is tiled once for the whole span
+        Span yAdjustedSpan{{x, y}, length, count};
+
+        if (!fXStrategy.maybeProcessSpan(yAdjustedSpan, fNext)) {  // X strategy may decline
+            span_fallback(span, this);  // note: passes the original span, not yAdjustedSpan
+        }
+    }
+
+private:
+    Next* const fNext;     // downstream stage; not owned by this object
+    XStrategy fXStrategy;  // held by value; copied when the stage is cloned
+    YStrategy fYStrategy;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Specialized Samplers
+
+// RGBA8888UnitRepeatSrc - A sampler that takes advantage of the fact that the src and destination
+// are the same format and do not need any transformations in pixel space. Therefore, there is no
+// need to convert them to HiFi pixel format.
+class RGBA8888UnitRepeatSrc final : public SkLinearBitmapPipeline::SampleProcessorInterface,
+                                    public SkLinearBitmapPipeline::DestinationInterface {
+public:
+    RGBA8888UnitRepeatSrc(const uint32_t* src, int32_t width)  // width: row pitch in pixels
+        : fSrc{src}, fWidth{width} { }
+
+    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        SkASSERT(fDest + n <= fEnd);
+        // At this point xs and ys should be >= 0, so trunc is the same as floor.
+        const Sk4i iXs = SkNx_cast<int>(xs);
+        const Sk4i iYs = SkNx_cast<int>(ys);
+        // Only the first n lanes (at most three) carry real points; copy exactly those.
+        if (n >= 1) *fDest++ = *this->pixelAddress(iXs[0], iYs[0]);
+        if (n >= 2) *fDest++ = *this->pixelAddress(iXs[1], iYs[1]);
+        if (n >= 3) *fDest++ = *this->pixelAddress(iXs[2], iYs[2]);
+    }
+
+    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        SkASSERT(fDest + 4 <= fEnd);
+        const Sk4i iXs = SkNx_cast<int>(xs);  // truncation, as in pointListFew
+        const Sk4i iYs = SkNx_cast<int>(ys);
+        *fDest++ = *this->pixelAddress(iXs[0], iYs[0]);
+        *fDest++ = *this->pixelAddress(iXs[1], iYs[1]);
+        *fDest++ = *this->pixelAddress(iXs[2], iYs[2]);
+        *fDest++ = *this->pixelAddress(iXs[3], iYs[3]);
+    }
+
+    void pointSpan(Span span) override {
+        SkASSERT(fDest + span.count() <= fEnd);
+        if (span.length() != 0.0f) {  // zero-length spans write nothing and leave fDest alone
+            const int32_t x = SkScalarTruncToInt(span.startX());
+            const int32_t y = SkScalarTruncToInt(span.startY());
+            const uint32_t* src = this->pixelAddress(x, y);
+            memmove(fDest, src, span.count() * sizeof(uint32_t));  // straight row copy
+            fDest += span.count();
+        }
+    }
+
+    void repeatSpan(Span span, int32_t repeatCount) override {
+        SkASSERT(fDest + span.count() * repeatCount <= fEnd);
+        const int runLength = span.count();  // pixels copied per repetition
+        const int32_t x = SkScalarTruncToInt(span.startX());
+        const int32_t y = SkScalarTruncToInt(span.startY());
+        const uint32_t* src = this->pixelAddress(x, y);
+        uint32_t* dest = fDest;
+        for (int32_t i = 0; i < repeatCount; ++i) {  // same source run, repeatCount times
+            memmove(dest, src, runLength * sizeof(uint32_t));
+            dest += runLength;
+        }
+        fDest = dest;
+    }
+
+    void setDestination(void* dst, int count) override {  // must precede any point/span call
+        fDest = static_cast<uint32_t*>(dst);
+        fEnd = fDest + count;
+    }
+
+private:
+    const uint32_t* pixelAddress(int32_t x, int32_t y) const {  // no bounds checking
+        return &fSrc[fWidth * y + x];
+    }
+    const uint32_t* const fSrc;
+    const int32_t         fWidth;
+    uint32_t*             fDest = nullptr;  // next pixel to write; set by setDestination()
+    uint32_t*             fEnd  = nullptr;  // one past the last writable pixel
+};
+
+// RGBA8888UnitRepeatSrcOver - A sampler that takes advantage of the fact that the src and
+// destination are the same format and do not need any transformations in pixel space, so the
+// source can be blended over the destination without converting to HiFi pixel format.
+class RGBA8888UnitRepeatSrcOver final : public SkLinearBitmapPipeline::SampleProcessorInterface,
+                                        public SkLinearBitmapPipeline::DestinationInterface {
+public:
+    RGBA8888UnitRepeatSrcOver(const uint32_t* src, int32_t width)  // width: row pitch in pixels
+        : fSrc{src}, fWidth{width} { }
+
+    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        SkASSERT(fDest + n <= fEnd);
+        // At this point xs and ys should be >= 0, so trunc is the same as floor.
+        const Sk4i iXs = SkNx_cast<int>(xs);
+        const Sk4i iYs = SkNx_cast<int>(ys);
+        // Only the first n lanes (at most three) carry real points; blend exactly those.
+        if (n >= 1) this->blendPixelAt(iXs[0], iYs[0]);
+        if (n >= 2) this->blendPixelAt(iXs[1], iYs[1]);
+        if (n >= 3) this->blendPixelAt(iXs[2], iYs[2]);
+    }
+
+    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        SkASSERT(fDest + 4 <= fEnd);
+        const Sk4i iXs = SkNx_cast<int>(xs);  // truncation, as in pointListFew
+        const Sk4i iYs = SkNx_cast<int>(ys);
+        this->blendPixelAt(iXs[0], iYs[0]);
+        this->blendPixelAt(iXs[1], iYs[1]);
+        this->blendPixelAt(iXs[2], iYs[2]);
+        this->blendPixelAt(iXs[3], iYs[3]);
+    }
+
+    void pointSpan(Span span) override {
+        if (span.length() != 0.0f) {  // zero-length spans blend nothing and leave fDest alone
+            this->repeatSpan(span, 1);
+        }
+    }
+
+    void repeatSpan(Span span, int32_t repeatCount) override {
+        SkASSERT(span.count() > 0);
+        SkASSERT(repeatCount > 0);
+        const int total = span.count() * repeatCount;  // destination pixels to produce
+        SkASSERT(fDest + total <= fEnd);
+
+        // Use the same truncation helper as RGBA8888UnitRepeatSrc instead of a C-style cast.
+        const int32_t x = SkScalarTruncToInt(span.startX());
+        const int32_t y = SkScalarTruncToInt(span.startY());
+        const uint32_t* beginSpan = this->pixelAddress(x, y);
+
+        SkOpts::srcover_srgb_srgb(fDest, beginSpan, total, span.count());
+        fDest += total;
+
+        SkASSERT(fDest <= fEnd);
+    }
+
+    void setDestination(void* dst, int count) override {  // must precede any point/span call
+        SkASSERT(count > 0);
+        fDest = static_cast<uint32_t*>(dst);
+        fEnd = fDest + count;
+    }
+
+private:
+    const uint32_t* pixelAddress(int32_t x, int32_t y) const {  // no bounds checking
+        return &fSrc[fWidth * y + x];
+    }
+
+    void blendPixelAt(int32_t x, int32_t y) {  // blends one source pixel, advances fDest by one
+        const uint32_t* src = this->pixelAddress(x, y);
+        SkOpts::srcover_srgb_srgb(fDest, src, 1, 1);
+        fDest += 1;
+    }
+
+    const uint32_t* const fSrc;
+    const int32_t         fWidth;
+    uint32_t*             fDest = nullptr;  // next pixel to blend into; set by setDestination()
+    uint32_t*             fEnd  = nullptr;  // one past the last writable pixel
+};
+
+using Blender = SkLinearBitmapPipeline::BlendProcessorInterface;  // file-local shorthand
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Pixel Blender Stage
+template <SkAlphaType alphaType>
+class SrcFPPixel final : public Blender {
+public:
+    explicit SrcFPPixel(float postAlpha) : fPostAlpha{postAlpha} { }
+    SrcFPPixel(const SrcFPPixel& that) : fPostAlpha(that.fPostAlpha) {}  // destination not copied
+    void SK_VECTORCALL blendPixel(Sk4f pixel) override {
+        SkASSERT(fDst + 1 <= fEnd );
+        this->srcPixel(fDst, pixel, 0);
+        fDst += 1;
+    }
+
+    void SK_VECTORCALL blend4Pixels(Sk4f p0, Sk4f p1, Sk4f p2, Sk4f p3) override {
+        SkASSERT(fDst + 4 <= fEnd);
+        SkPM4f* dst = fDst;  // the four pixels land in consecutive slots dst[0..3]
+        this->srcPixel(dst, p0, 0);
+        this->srcPixel(dst, p1, 1);
+        this->srcPixel(dst, p2, 2);
+        this->srcPixel(dst, p3, 3);
+        fDst += 4;
+    }
+
+    void setDestination(void* dst, int count) override {  // must precede any blend call
+        fDst = static_cast<SkPM4f*>(dst);
+        fEnd = fDst + count;
+    }
+
+private:
+    void SK_VECTORCALL srcPixel(SkPM4f* dst, Sk4f pixel, int index) {
+        check_pixel(pixel);
+
+        Sk4f newPixel = pixel;
+        if (alphaType == kUnpremul_SkAlphaType) {  // compile-time constant per instantiation
+            newPixel = Premultiply(pixel);
+        }
+        newPixel = newPixel * fPostAlpha;  // scales all four channels, alpha included
+        newPixel.store(dst + index);
+    }
+    static Sk4f SK_VECTORCALL Premultiply(Sk4f pixel) {
+        float alpha = pixel[3];
+        return pixel * Sk4f{alpha, alpha, alpha, 1.0f};  // lane 3 (alpha) is left unscaled
+    }
+
+    SkPM4f* fDst = nullptr;  // next output pixel; set by setDestination()
+    SkPM4f* fEnd = nullptr;  // one past the last writable pixel
+    float   fPostAlpha;      // multiplier applied to every output pixel
+};
+
+}  // namespace
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SkLinearBitmapPipeline
+SkLinearBitmapPipeline::~SkLinearBitmapPipeline() = default;  // nothing is released here
+
+SkLinearBitmapPipeline::SkLinearBitmapPipeline(
+    const SkMatrix& inverse,
+    SkFilterQuality filterQuality,
+    SkShader::TileMode xTile, SkShader::TileMode yTile,
+    SkColor paintColor,
+    const SkPixmap& srcPixmap,
+    SkArenaAlloc* allocator)
+{
+    SkISize dimensions = srcPixmap.info().dimensions();
+    const SkImageInfo& srcImageInfo = srcPixmap.info();
+
+    SkMatrix adjustedInverse = inverse;
+    if (filterQuality == kNone_SkFilterQuality) {  // adjustment applies to unfiltered sampling
+        if (inverse.getScaleX() >= 0.0f) {
+            adjustedInverse.setTranslateX(  // one float step toward floor(translateX)
+                nextafterf(inverse.getTranslateX(), std::floor(inverse.getTranslateX())));
+        }
+        if (inverse.getScaleY() >= 0.0f) {
+            adjustedInverse.setTranslateY(  // one float step toward floor(translateY)
+                nextafterf(inverse.getTranslateY(), std::floor(inverse.getTranslateY())));
+        }
+    }
+
+    SkScalar dx = adjustedInverse.getScaleX();  // x scale; chooseTiler special-cases dx == 1
+
+    // If it is an index 8 color type, the sampler converts to unpremul for better fidelity.
+    SkAlphaType alphaType = srcImageInfo.alphaType();
+    if (srcPixmap.colorType() == kIndex_8_SkColorType) {
+        alphaType = kUnpremul_SkAlphaType;
+    }
+
+    float postAlpha = SkColorGetA(paintColor) * (1.0f / 255.0f);  // paint alpha as a float scale
+    // The stages are built back to front, each one receiving the stage it feeds. As they are
+    // built, a chooser may skip a stage: with the identity matrix, the tiler is the first stage.
+    auto blenderStage = this->chooseBlenderForShading(alphaType, postAlpha, allocator);
+    auto samplerStage = this->chooseSampler(
+        blenderStage, filterQuality, xTile, yTile, srcPixmap, paintColor, allocator);
+    auto tilerStage   = this->chooseTiler(
+        samplerStage, dimensions, xTile, yTile, filterQuality, dx, allocator);
+    fFirstStage       = this->chooseMatrix(tilerStage, adjustedInverse, allocator);
+    fLastStage        = blenderStage;  // the blender is the stage that receives the destination
+}
+
+SkLinearBitmapPipeline::SkLinearBitmapPipeline(
+    const SkLinearBitmapPipeline& pipeline,
+    const SkPixmap& srcPixmap,
+    SkBlendMode mode,
+    const SkImageInfo& dstInfo,
+    SkArenaAlloc* allocator)
+{
+    SkASSERT(mode == SkBlendMode::kSrc || mode == SkBlendMode::kSrcOver);  // only two modes
+    SkASSERT(srcPixmap.info().colorType() == dstInfo.colorType()
+             && srcPixmap.info().colorType() == kRGBA_8888_SkColorType);
+
+    SampleProcessorInterface* sampleStage;
+    if (mode == SkBlendMode::kSrc) {
+        auto sampler = allocator->make<RGBA8888UnitRepeatSrc>(
+            srcPixmap.writable_addr32(0, 0), srcPixmap.rowBytes() / 4);  // pitch in pixels
+        sampleStage = sampler;
+        fLastStage = sampler;  // the sampler also serves as the destination stage
+    } else {
+        auto sampler = allocator->make<RGBA8888UnitRepeatSrcOver>(
+            srcPixmap.writable_addr32(0, 0), srcPixmap.rowBytes() / 4);  // pitch in pixels
+        sampleStage = sampler;
+        fLastStage = sampler;  // the sampler also serves as the destination stage
+    }
+
+    auto tilerStage = pipeline.fTileStageCloner(sampleStage, allocator);  // copy of its tiler
+    auto matrixStage = pipeline.fMatrixStageCloner(tilerStage, allocator);  // and its matrix
+    fFirstStage = matrixStage;  // this object's own cloner members are left unset
+}
+
+void SkLinearBitmapPipeline::shadeSpan4f(int x, int y, SkPM4f* dst, int count) {
+    SkASSERT(count > 0);  // at least one pixel must be requested
+    blitSpan(x, y, static_cast<void*>(dst), count);  // shading reuses the blit path
+}
+
+void SkLinearBitmapPipeline::blitSpan(int x, int y, void* dst, int count) {
+    SkASSERT(count > 0);
+    fLastStage->setDestination(dst, count);
+    // Sampling happens at pixel centers, so the span starts half a pixel in and its length is
+    // the center-to-center distance between the first and last pixel, i.e. count - 1. Count and
+    // length start out in exactly this relation to keep the math correct through the stages.
+    const SkPoint firstCenter{x + 0.5f, y + 0.5f};
+    const SkScalar centerToCenter = count - 1.0f;
+    fFirstStage->pointSpan(Span{firstCenter, centerToCenter, count});
+}
+
+SkLinearBitmapPipeline::PointProcessorInterface*
+SkLinearBitmapPipeline::chooseMatrix(
+    PointProcessorInterface* next,
+    const SkMatrix& inverse,
+    SkArenaAlloc* allocator)
+{
+    if (inverse.hasPerspective()) {  // branches run from most general to least general
+        auto matrixStage = allocator->make<PerspectiveMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
+            SkVector{inverse.getScaleX(), inverse.getScaleY()},
+            SkVector{inverse.getSkewX(), inverse.getSkewY()},
+            SkVector{inverse.getPerspX(), inverse.getPerspY()},
+            inverse.get(SkMatrix::kMPersp2));
+        fMatrixStageCloner =  // the cloner keeps a raw pointer to matrixStage
+            [matrixStage](PointProcessorInterface* cloneNext, SkArenaAlloc* memory) {
+                return memory->make<PerspectiveMatrix<>>(cloneNext, matrixStage);
+            };
+        return matrixStage;
+    } else if (inverse.getSkewX() != 0.0f || inverse.getSkewY() != 0.0f) {  // any skew
+        auto matrixStage = allocator->make<AffineMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
+            SkVector{inverse.getScaleX(), inverse.getScaleY()},
+            SkVector{inverse.getSkewX(), inverse.getSkewY()});
+        fMatrixStageCloner =
+            [matrixStage](PointProcessorInterface* cloneNext, SkArenaAlloc* memory) {
+                return memory->make<AffineMatrix<>>(cloneNext, matrixStage);
+            };
+        return matrixStage;
+    } else if (inverse.getScaleX() != 1.0f || inverse.getScaleY() != 1.0f) {  // non-unit scale
+        auto matrixStage = allocator->make<ScaleMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
+            SkVector{inverse.getScaleX(), inverse.getScaleY()});
+        fMatrixStageCloner =
+            [matrixStage](PointProcessorInterface* cloneNext, SkArenaAlloc* memory) {
+                return memory->make<ScaleMatrix<>>(cloneNext, matrixStage);
+            };
+        return matrixStage;
+    } else if (inverse.getTranslateX() != 0.0f || inverse.getTranslateY() != 0.0f) {
+        auto matrixStage = allocator->make<TranslateMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()});
+        fMatrixStageCloner =
+            [matrixStage](PointProcessorInterface* cloneNext, SkArenaAlloc* memory) {
+                return memory->make<TranslateMatrix<>>(cloneNext, matrixStage);
+            };
+        return matrixStage;
+    } else {  // identity: no matrix stage is created at all
+        fMatrixStageCloner = [](PointProcessorInterface* cloneNext, SkArenaAlloc* memory) {
+            return cloneNext;  // cloning an absent stage just passes the next stage through
+        };
+        return next;
+    }
+}
+
+// Builds the tile stage feeding `next` and records how to clone it for another pipeline.
+template <typename Tiler>
+SkLinearBitmapPipeline::PointProcessorInterface* SkLinearBitmapPipeline::createTiler(
+    SampleProcessorInterface* next, SkISize dimensions, SkArenaAlloc* allocator)
+{
+    auto original = allocator->make<Tiler>(next, dimensions);
+    // The cloner holds a raw pointer to `original` and copies its strategies on each call.
+    auto cloneTiler = [original](SampleProcessorInterface* sampler,
+                                 SkArenaAlloc* arena) -> PointProcessorInterface* {
+        return arena->make<Tiler>(sampler, original);
+    };
+    fTileStageCloner = cloneTiler;
+    return original;
+}
+
+template <typename XStrategy>
+SkLinearBitmapPipeline::PointProcessorInterface* SkLinearBitmapPipeline::chooseTilerYMode(
+    SampleProcessorInterface* next,
+    SkShader::TileMode yMode,
+    SkISize dimensions,
+    SkArenaAlloc* allocator)
+{
+    // The X strategy has already been chosen by the caller; pair it with the Y strategy that
+    // matches yMode and build the combined tile stage.
+    using ClampY  = CombinedTileStage<XStrategy, YClampStrategy, SampleProcessorInterface>;
+    using RepeatY = CombinedTileStage<XStrategy, YRepeatStrategy, SampleProcessorInterface>;
+    using MirrorY = CombinedTileStage<XStrategy, YMirrorStrategy, SampleProcessorInterface>;
+
+    switch (yMode) {
+        case SkShader::kClamp_TileMode:
+            return this->createTiler<ClampY>(next, dimensions, allocator);
+        case SkShader::kRepeat_TileMode:
+            return this->createTiler<RepeatY>(next, dimensions, allocator);
+        case SkShader::kMirror_TileMode:
+            return this->createTiler<MirrorY>(next, dimensions, allocator);
+    }
+
+    // Every handled mode returns above; this is reached only for an unhandled yMode value.
+    SkFAIL("Not all Y tile cases covered.");
+    return nullptr;
+}
+
+SkLinearBitmapPipeline::PointProcessorInterface* SkLinearBitmapPipeline::chooseTiler(
+    SampleProcessorInterface* next,
+    SkISize dimensions,
+    SkShader::TileMode xMode,
+    SkShader::TileMode yMode,
+    SkFilterQuality filterQuality,
+    SkScalar dx,
+    SkArenaAlloc* allocator)
+{
+    // Unfiltered repeat at exactly unit x-scale gets the specialized unit-scale X strategy.
+    const bool unitScaleRepeat = (filterQuality == kNone_SkFilterQuality) && (dx == 1.0f);
+    switch (xMode) {
+        case SkShader::kClamp_TileMode:
+            return this->chooseTilerYMode<XClampStrategy>(next, yMode, dimensions, allocator);
+        case SkShader::kMirror_TileMode:
+            return this->chooseTilerYMode<XMirrorStrategy>(next, yMode, dimensions, allocator);
+        case SkShader::kRepeat_TileMode:
+            if (!unitScaleRepeat) {
+                return this->chooseTilerYMode<XRepeatStrategy>(
+                    next, yMode, dimensions, allocator);
+            }
+            return this->chooseTilerYMode<XRepeatUnitScaleStrategy>(
+                next, yMode, dimensions, allocator);
+    }
+    // Every handled mode returns above; this is reached only for an unhandled xMode value.
+    SkFAIL("Not all X tile cases covered.");
+    return nullptr;
+}
+
+template <SkColorType colorType>
+SkLinearBitmapPipeline::PixelAccessorInterface*
+    SkLinearBitmapPipeline::chooseSpecificAccessor(
+    const SkPixmap& srcPixmap,
+    SkArenaAlloc* allocator)
+{
+    // Same color type either way; only the gamma tag differs, picked from the pixmap's info.
+    using SRGBAccessor   = PixelAccessor<colorType, kSRGB_SkGammaType>;
+    using LinearAccessor = PixelAccessor<colorType, kLinear_SkGammaType>;
+    if (!srcPixmap.info().gammaCloseToSRGB()) {
+        return allocator->make<LinearAccessor>(srcPixmap);
+    }
+    return allocator->make<SRGBAccessor>(srcPixmap);
+}
+
+SkLinearBitmapPipeline::PixelAccessorInterface* SkLinearBitmapPipeline::choosePixelAccessor(
+    const SkPixmap& srcPixmap,
+    const SkColor A8TintColor,
+    SkArenaAlloc* allocator)
+{
+    const SkImageInfo& imageInfo = srcPixmap.info();
+
+    switch (imageInfo.colorType()) {  // one accessor instantiation per supported color type
+        case kAlpha_8_SkColorType: {  // the only case that receives the tint color
+            using Accessor = PixelAccessor<kAlpha_8_SkColorType, kLinear_SkGammaType>;
+            return allocator->make<Accessor>(srcPixmap, A8TintColor);
+        }
+        case kARGB_4444_SkColorType:  // the next six cases pick gamma from the pixmap's info
+            return this->chooseSpecificAccessor<kARGB_4444_SkColorType>(srcPixmap, allocator);
+        case kRGB_565_SkColorType:
+            return this->chooseSpecificAccessor<kRGB_565_SkColorType>(srcPixmap, allocator);
+        case kRGBA_8888_SkColorType:
+            return this->chooseSpecificAccessor<kRGBA_8888_SkColorType>(srcPixmap, allocator);
+        case kBGRA_8888_SkColorType:
+            return this->chooseSpecificAccessor<kBGRA_8888_SkColorType>(srcPixmap, allocator);
+        case kIndex_8_SkColorType:
+            return this->chooseSpecificAccessor<kIndex_8_SkColorType>(srcPixmap, allocator);
+        case kGray_8_SkColorType:
+            return this->chooseSpecificAccessor<kGray_8_SkColorType>(srcPixmap, allocator);
+        case kRGBA_F16_SkColorType: {  // F16 always uses the linear gamma tag
+            using Accessor = PixelAccessor<kRGBA_F16_SkColorType, kLinear_SkGammaType>;
+            return allocator->make<Accessor>(srcPixmap);
+        }
+        default:
+            // Should never get here.
+            SkFAIL("Pixel source not supported.");
+            return nullptr;
+    }
+}
+
+SkLinearBitmapPipeline::SampleProcessorInterface* SkLinearBitmapPipeline::chooseSampler(
+    Blender* next,
+    SkFilterQuality filterQuality,
+    SkShader::TileMode xTile, SkShader::TileMode yTile,
+    const SkPixmap& srcPixmap,
+    const SkColor A8TintColor,
+    SkArenaAlloc* allocator)
+{
+    const SkImageInfo& imageInfo = srcPixmap.info();
+    SkISize dimensions = imageInfo.dimensions();
+
+    // Special case samplers with fully expanded templates (sRGB-gamma N32 and Index8 sources).
+    if (imageInfo.gammaCloseToSRGB()) {
+        if (filterQuality == kNone_SkFilterQuality) {  // no filtering: nearest neighbor
+            switch (imageInfo.colorType()) {
+                case kN32_SkColorType: {
+                    using Sampler =
+                    NearestNeighborSampler<
+                        PixelAccessor<kN32_SkColorType, kSRGB_SkGammaType>, Blender>;
+                    return allocator->make<Sampler>(next, srcPixmap);
+                }
+                case kIndex_8_SkColorType: {
+                    using Sampler =
+                    NearestNeighborSampler<
+                        PixelAccessor<kIndex_8_SkColorType, kSRGB_SkGammaType>, Blender>;
+                    return allocator->make<Sampler>(next, srcPixmap);
+                }
+                default:
+                    break;  // other color types fall through to the general path below
+            }
+        } else {  // any other filter quality uses the bilerp sampler
+            switch (imageInfo.colorType()) {
+                case kN32_SkColorType: {
+                    using Sampler =
+                    BilerpSampler<
+                        PixelAccessor<kN32_SkColorType, kSRGB_SkGammaType>, Blender>;
+                    return allocator->make<Sampler>(next, dimensions, xTile, yTile, srcPixmap);
+                }
+                case kIndex_8_SkColorType: {
+                    using Sampler =
+                    BilerpSampler<
+                        PixelAccessor<kIndex_8_SkColorType, kSRGB_SkGammaType>, Blender>;
+                    return allocator->make<Sampler>(next, dimensions, xTile, yTile, srcPixmap);
+                }
+                default:
+                    break;  // other color types fall through to the general path below
+            }
+        }
+    }
+
+    auto pixelAccessor = this->choosePixelAccessor(srcPixmap, A8TintColor, allocator);
+    // General cases: the sampler reaches the pixels through PixelAccessorShim and pixelAccessor.
+    if (filterQuality == kNone_SkFilterQuality) {
+        using Sampler = NearestNeighborSampler<PixelAccessorShim, Blender>;
+        return allocator->make<Sampler>(next, pixelAccessor);
+    } else {
+        using Sampler = BilerpSampler<PixelAccessorShim, Blender>;
+        return allocator->make<Sampler>(next, dimensions, xTile, yTile, pixelAccessor);
+    }
+}
+
+// Allocates the SrcFPPixel blender instantiation that matches the source alpha type.
+Blender* SkLinearBitmapPipeline::chooseBlenderForShading(
+    SkAlphaType alphaType, float postAlpha, SkArenaAlloc* allocator)
+{
+    // Only unpremultiplied input selects the premultiplying instantiation.
+    const bool needsPremul = (alphaType == kUnpremul_SkAlphaType);
+    if (!needsPremul) {
+        // kOpaque_SkAlphaType is treated the same as kPremul_SkAlphaType
+        return allocator->make<SrcFPPixel<kPremul_SkAlphaType>>(postAlpha);
+    }
+    return allocator->make<SrcFPPixel<kUnpremul_SkAlphaType>>(postAlpha);
+}
diff --git a/src/core/SkLinearBitmapPipeline.h b/src/core/SkLinearBitmapPipeline.h
new file mode 100644
index 0000000000..6f6e2ae602
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkLinearBitmapPipeline_DEFINED
+#define SkLinearBitmapPipeline_DEFINED
+
+#include "SkArenaAlloc.h"
+#include "SkColor.h"
+#include "SkImageInfo.h"
+#include "SkMatrix.h"
+#include "SkShader.h"
+
+class SkEmbeddableLinearPipeline;
+
+enum SkGammaType {  // compile-time tag passed to the pipeline's PixelAccessor templates
+    kLinear_SkGammaType,  // used when the source info is not gamma-close-to-sRGB
+    kSRGB_SkGammaType,    // used when the source info reports gamma close to sRGB
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// SkLinearBitmapPipeline - encapsulates all the machinery for doing floating point pixel
+// processing in a linear color space.
+// Note: this class has unusual alignment requirements due to its use of SIMD instructions. The
+// class SkEmbeddableLinearPipeline below manages these requirements.
+class SkLinearBitmapPipeline {
+public:
+    SkLinearBitmapPipeline(  // builds every stage of a new pipeline from allocator
+        const SkMatrix& inverse,
+        SkFilterQuality filterQuality,
+        SkShader::TileMode xTile, SkShader::TileMode yTile,
+        SkColor paintColor,
+        const SkPixmap& srcPixmap,
+        SkArenaAlloc* allocator);
+
+    SkLinearBitmapPipeline(  // clones pipeline's matrix and tile stages onto an 8888 sampler
+        const SkLinearBitmapPipeline& pipeline,
+        const SkPixmap& srcPixmap,
+        SkBlendMode,
+        const SkImageInfo& dstInfo,
+        SkArenaAlloc* allocator);
+
+    ~SkLinearBitmapPipeline();
+
+    void shadeSpan4f(int x, int y, SkPM4f* dst, int count);  // count must be > 0
+    void blitSpan(int32_t x, int32_t y, void* dst, int count);  // count must be > 0
+
+    class PointProcessorInterface;
+    class SampleProcessorInterface;
+    class BlendProcessorInterface;
+    class DestinationInterface;
+    class PixelAccessorInterface;
+
+    using MatrixCloner =  // (next stage, arena) -> cloned matrix stage
+        std::function<PointProcessorInterface* (PointProcessorInterface*, SkArenaAlloc*)>;
+    using TilerCloner =   // (sampler stage, arena) -> cloned tile stage
+        std::function<PointProcessorInterface* (SampleProcessorInterface*, SkArenaAlloc*)>;
+
+    PointProcessorInterface* chooseMatrix(  // also sets fMatrixStageCloner
+        PointProcessorInterface* next,
+        const SkMatrix& inverse,
+        SkArenaAlloc* allocator);
+
+    template <typename Tiler>  // also sets fTileStageCloner
+    PointProcessorInterface* createTiler(SampleProcessorInterface* next, SkISize dimensions,
+                                         SkArenaAlloc* allocator);
+
+    template <typename XStrategy>  // picks the Y strategy for an already-chosen X strategy
+    PointProcessorInterface* chooseTilerYMode(
+        SampleProcessorInterface* next, SkShader::TileMode yMode, SkISize dimensions,
+        SkArenaAlloc* allocator);
+
+    PointProcessorInterface* chooseTiler(  // picks the X strategy, then defers to the above
+        SampleProcessorInterface* next,
+        SkISize dimensions,
+        SkShader::TileMode xMode, SkShader::TileMode yMode,
+        SkFilterQuality filterQuality,
+        SkScalar dx,
+        SkArenaAlloc* allocator);
+
+    template <SkColorType colorType>  // picks the gamma tag for a fixed color type
+    PixelAccessorInterface* chooseSpecificAccessor(const SkPixmap& srcPixmap,
+                                                   SkArenaAlloc* allocator);
+
+    PixelAccessorInterface* choosePixelAccessor(  // dispatches on the pixmap's color type
+        const SkPixmap& srcPixmap,
+        const SkColor A8TintColor,
+        SkArenaAlloc* allocator);
+
+    SampleProcessorInterface* chooseSampler(  // nearest-neighbor or bilerp sampler
+        BlendProcessorInterface* next,
+        SkFilterQuality filterQuality,
+        SkShader::TileMode xTile, SkShader::TileMode yTile,
+        const SkPixmap& srcPixmap,
+        const SkColor A8TintColor,
+        SkArenaAlloc* allocator);
+
+    BlendProcessorInterface* chooseBlenderForShading(  // blender for float (SkPM4f) output
+        SkAlphaType alphaType,
+        float postAlpha,
+        SkArenaAlloc* allocator);
+
+    PointProcessorInterface* fFirstStage;         // entry stage; blitSpan feeds spans to it
+    MatrixCloner             fMatrixStageCloner;  // set by chooseMatrix
+    TilerCloner              fTileStageCloner;    // set by createTiler
+    DestinationInterface*    fLastStage;          // stage that is handed the output buffer
+};
+
+#endif  // SkLinearBitmapPipeline_DEFINED
diff --git a/src/core/SkLinearBitmapPipeline_core.h b/src/core/SkLinearBitmapPipeline_core.h
new file mode 100644
index 0000000000..ce6c05b752
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline_core.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkLinearBitmapPipeline_core_DEFINED
+#define SkLinearBitmapPipeline_core_DEFINED
+
+#include <algorithm>
+#include <cmath>
+#include "SkNx.h"
+
+// New bilerp strategy:
+// Pass through on bilerpList4 and bilerpListFew (analogs to pointList), introduce bilerpEdge
+// which takes 4 points. If the sample spans an edge, then break it into a bilerpEdge. Bilerp
+// span then becomes a normal span except in special cases where an extra Y is given. The bilerps
+// need to stay single point calculations until the tile layer.
+// TODO:
+//  - edge span predicate.
+//  - introduce new point API
+//  - Add tile for new api.
+
+namespace {
+struct X {  // Tagged scalar holding the horizontal component of a scalar, point or size.
+    explicit X(SkScalar val) : fVal{val} { }
+    explicit X(SkPoint pt)   : fVal{pt.fX} { }  // x coordinate of the point
+    explicit X(SkSize s)     : fVal{s.fWidth} { }  // width of the size
+    explicit X(SkISize s)    : fVal((SkScalar)s.fWidth) { }  // integer width as a scalar
+    operator SkScalar () const {return fVal;}  // implicit read-back as a plain scalar
+private:
+    SkScalar fVal;
+};
+
+struct Y {  // Tagged scalar holding the vertical component of a scalar, point or size.
+    explicit Y(SkScalar val) : fVal{val} { }
+    explicit Y(SkPoint pt)   : fVal{pt.fY} { }  // y coordinate of the point
+    explicit Y(SkSize s)     : fVal{s.fHeight} { }  // height of the size
+    explicit Y(SkISize s)    : fVal((SkScalar)s.fHeight) { }  // integer height as a scalar
+    operator SkScalar () const {return fVal;}  // implicit read-back as a plain scalar
+private:
+    SkScalar fVal;
+};
+
+// The Span class enables efficient processing of horizontal spans of pixels.
+// * start - the point where to start the span.
+// * length - the number of pixels to traverse in source space.
+// * count - the number of pixels to produce in destination space.
+// Both start and length are mapped through the inversion matrix to produce values in source
+// space. After the matrix operation, the tilers may break the spans up into smaller spans.
+// The tilers can produce spans that seem nonsensical.
+// * The clamp tiler can create spans with length of 0. This indicates to copy an edge pixel out
+//   to the edge of the destination scan.
+// * The mirror tiler can produce spans with negative length. This indicates that the source
+//   should be traversed in the opposite direction to the destination pixels.
+class Span {
+public:
+    Span(SkPoint start, SkScalar length, int count)
+        : fStart(start)
+        , fLength(length)
+        , fCount{count} {
+        SkASSERT(std::isfinite(length));
+    }
+    // Exposes the fields by reference so std::tie(start, length, count) = span can unpack them.
+    operator std::tuple<SkPoint&, SkScalar&, int&>() {
+        return std::tie(fStart, fLength, fCount);
+    }
+
+    bool isEmpty() const { return 0 == fCount; }
+    void clear() { fCount = 0; }
+    int count() const { return fCount; }
+    SkScalar length() const { return fLength; }
+    SkScalar startX() const { return X(fStart); }
+    SkScalar endX() const { return this->startX() + this->length(); }
+    SkScalar startY() const { return Y(fStart); }
+    Span emptySpan() { return Span{{0.0, 0.0}, 0.0f, 0}; }
+    // True when both ends lie in [xMin, xMax). length may be negative, so the ends are ordered.
+    bool completelyWithin(SkScalar xMin, SkScalar xMax) const {
+        SkScalar sMin, sMax;
+        std::tie(sMin, sMax) = std::minmax(startX(), endX());
+        return xMin <= sMin && sMax < xMax;
+    }
+    // Shifts the start of the span horizontally by offsetX.
+    void offset(SkScalar offsetX) {
+        fStart.offset(offsetX, 0.0f);
+    }
+    // Cleaves off and returns the leading samples that come before breakX; this keeps the rest.
+    Span breakAt(SkScalar breakX, SkScalar dx) {
+        SkASSERT(std::isfinite(breakX));
+        SkASSERT(std::isfinite(dx));
+        SkASSERT(dx != 0.0f);
+
+        if (this->isEmpty()) {
+            return this->emptySpan();
+        }
+
+        int dxSteps = SkScalarFloorToInt((breakX - this->startX()) / dx);
+
+        if (dxSteps < 0) {
+            // The span is wholly after breakX.
+            return this->emptySpan();
+        } else if (dxSteps >= fCount) {
+            // The span is wholly before breakX.
+            Span answer = *this;
+            this->clear();
+            return answer;
+        }
+
+        // Calculate the values for the span to cleave off.
+        SkScalar newLength = dxSteps * dx;
+
+        // If the last (or first if count = 1) sample lands directly on the boundary, include it
+        // when dx < 0 and exclude it when dx > 0.
+        // Reasoning:
+        //  dx > 0: The sample point on the boundary is part of the next span because the entire
+        // pixel is after the boundary.
+        //  dx < 0: The sample point on the boundary is part of the current span because the
+        // entire pixel is before the boundary.
+        if (this->startX() + newLength == breakX && dx > 0) {
+            if (dxSteps > 0) {
+                dxSteps -= 1;
+                newLength -= dx;
+            } else {
+                return this->emptySpan();
+            }
+        }
+
+        // Calculate new span parameters
+        SkPoint newStart = fStart;
+        int newCount = dxSteps + 1;
+        SkASSERT(newCount > 0);
+
+        // Update this span to reflect the break.
+        SkScalar lengthToStart = newLength + dx;
+        fLength -= lengthToStart;
+        fCount -= newCount;
+        fStart = {this->startX() + lengthToStart, Y(fStart)};
+
+        return Span{newStart, newLength, newCount};
+    }
+    // Restarts the span at pixel with zero length; fCount is left unchanged.
+    void clampToSinglePixel(SkPoint pixel) {
+        fStart = pixel;
+        fLength = 0.0f;
+    }
+
+private:
+    SkPoint  fStart;
+    SkScalar fLength;
+    int      fCount;
+};
+
+template<typename Stage>
+void span_fallback(Span span, Stage* stage) {  // Expands span into point lists fed to stage.
+    SkPoint start;
+    SkScalar length;
+    int count;
+    std::tie(start, length, count) = span;
+    Sk4f startXs{X(start)};
+    Sk4f ys{Y(start)};
+    Sk4f mults = {0.0f, 1.0f, 2.0f, 3.0f};  // sample index held by each lane
+
+    // Initializing this is not needed, but some compilers can't figure this out.
+    Sk4s dXs{0.0f};
+    if (count > 1) {
+        SkScalar dx = length / (count - 1);  // step between consecutive samples
+        dXs = Sk4f{dx};
+    }
+
+    // Instead of using xs = xs + dx every round, this uses xs = i * dx + X(start). This
+    // eliminates the rounding error for the sum.
+    Sk4f xs = startXs + mults * dXs;
+    while (count >= 4) {
+        stage->pointList4(xs, ys);
+
+        mults += Sk4f{4.0f};
+        xs = mults * dXs + startXs;
+        count -= 4;
+    }
+
+    if (count > 0) {  // 1 to 3 samples remain; only the first count lanes of xs/ys are meant
+        stage->pointListFew(count, xs, ys);
+    }
+}
+
+inline Sk4f SK_VECTORCALL check_pixel(const Sk4f& pixel) {  // Asserts channels are in [0, 1].
+    SkASSERTF(0.0f <= pixel[0] && pixel[0] <= 1.0f, "pixel[0]: %f", pixel[0]);
+    SkASSERTF(0.0f <= pixel[1] && pixel[1] <= 1.0f, "pixel[1]: %f", pixel[1]);
+    SkASSERTF(0.0f <= pixel[2] && pixel[2] <= 1.0f, "pixel[2]: %f", pixel[2]);
+    SkASSERTF(0.0f <= pixel[3] && pixel[3] <= 1.0f, "pixel[3]: %f", pixel[3]);
+    return pixel;  // returned unchanged so the check can wrap an expression
+}
+
+}  // namespace
+
+class SkLinearBitmapPipeline::PointProcessorInterface {
+public:
+    virtual ~PointProcessorInterface() { }
+    // Take the first n (where 0 < n && n < 4) items from xs and ys and sample those points. For
+    // nearest neighbor, that means just taking the floor of xs and ys. For bilerp, this means
+    // expanding the bilerp filter around the point and sampling using that filter.
+    virtual void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) = 0;
+    // Same as pointListFew, but n = 4.
+    virtual void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) = 0;
+    // A span is a compact form of sample points that are obtained by mapping points from
+    // destination space to source space. This is used for horizontal lines only, and is mainly
+    // used to take advantage of memory coherence for horizontal spans.
+    virtual void pointSpan(Span span) = 0;
+};
+
+class SkLinearBitmapPipeline::SampleProcessorInterface
+    : public SkLinearBitmapPipeline::PointProcessorInterface {
+public:
+    // Used for nearest neighbor when the scale factor is 1.0. The span can just be repeated
+    // repeatCount times with no edge pixel alignment problems. This handles a very common case.
+    virtual void repeatSpan(Span span, int32_t repeatCount) = 0;
+};
+
+class SkLinearBitmapPipeline::DestinationInterface {
+public:
+    virtual ~DestinationInterface() { }
+    // The count is normally not needed, but in these early stages of development it is useful
+    // for checking bounds.
+    // TODO(herb): 4/6/2016 - remove count when code is stable.
+    virtual void setDestination(void* dst, int count) = 0;
+};
+
+class SkLinearBitmapPipeline::BlendProcessorInterface
+    : public SkLinearBitmapPipeline::DestinationInterface {
+public:  // NOTE(review): presumably these write to the buffer from setDestination - confirm.
+    virtual void SK_VECTORCALL blendPixel(Sk4f pixel0) = 0;  // receives one sampled pixel
+    virtual void SK_VECTORCALL blend4Pixels(Sk4f p0, Sk4f p1, Sk4f p2, Sk4f p3) = 0;  // four
+};
+
+class SkLinearBitmapPipeline::PixelAccessorInterface {
+public:
+    virtual ~PixelAccessorInterface() { }
+    virtual void SK_VECTORCALL getFewPixels(  // first n (1..3) pixels at (xs[i], ys[i])
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const = 0;
+    // Fetch the four pixels at (xs[i], ys[i]) into px0..px3.
+    virtual void SK_VECTORCALL get4Pixels(
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
+    // Fetch four consecutive pixels of the row src, starting at index.
+    virtual void get4Pixels(
+        const void* src, int index, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
+    // Fetch the pixel at position index within the given row.
+    virtual Sk4f getPixelFromRow(const void* row, int index) const = 0;
+    // Fetch the pixel at position index counted from the start of the pixel buffer.
+    virtual Sk4f getPixelAt(int index) const = 0;
+    // Pointer to the start of row y, suitable for getPixelFromRow / get4Pixels(src, ...).
+    virtual const void* row(int y) const = 0;
+};
+
+#endif // SkLinearBitmapPipeline_core_DEFINED
diff --git a/src/core/SkLinearBitmapPipeline_matrix.h b/src/core/SkLinearBitmapPipeline_matrix.h
new file mode 100644
index 0000000000..78f723148e
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline_matrix.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkLinearBitmapPipeline_matrix_DEFINED
+#define SkLinearBitmapPipeline_matrix_DEFINED
+
+#include "SkLinearBitmapPipeline_core.h"
+
+namespace {
+class TranslateMatrixStrategy {  // Matrix strategy that only adds an offset.
+public:
+    TranslateMatrixStrategy(SkVector offset)
+        : fXOffset{X(offset)}
+        , fYOffset{Y(offset)} { }
+
+    void processPoints(Sk4s* xs, Sk4s* ys) const {  // adds the offset to every lane in place
+        *xs = *xs + fXOffset;
+        *ys = *ys + fYOffset;
+    }
+    // Forwards the span with only its start translated; always handles the span (returns true).
+    template <typename Next>
+    bool maybeProcessSpan(Span span, Next* next) const {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        next->pointSpan(Span{start + SkPoint{fXOffset, fYOffset}, length, count});
+        return true;
+    }
+
+private:
+    const SkScalar fXOffset, fYOffset;
+};
+
+class ScaleMatrixStrategy {  // Matrix strategy applying a per-axis scale followed by an offset.
+public:
+    ScaleMatrixStrategy(SkVector offset, SkVector scale)
+        : fXOffset{X(offset)}, fYOffset{Y(offset)}
+        ,  fXScale{X(scale)},   fYScale{Y(scale)} { }
+    void processPoints(Sk4s* xs, Sk4s* ys) const {  // scale then offset, per lane, in place
+        *xs = *xs * fXScale + fXOffset;
+        *ys = *ys * fYScale + fYOffset;
+    }
+    // Forwards the span with its start mapped and its length scaled by fXScale; returns true.
+    template <typename Next>
+    bool maybeProcessSpan(Span span, Next* next) const {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        SkPoint newStart =
+            SkPoint{X(start) * fXScale + fXOffset, Y(start) * fYScale + fYOffset};
+        SkScalar newLength = length * fXScale;  // the span is horizontal, so only x scale applies
+        next->pointSpan(Span{newStart, newLength, count});
+        return true;
+    }
+
+private:
+    const SkScalar fXOffset, fYOffset;
+    const SkScalar fXScale,  fYScale;
+};
+
+class AffineMatrixStrategy {  // Matrix strategy applying scale, skew and offset.
+public:
+    AffineMatrixStrategy(SkVector offset, SkVector scale, SkVector skew)
+        : fXOffset{X(offset)}, fYOffset{Y(offset)}
+        , fXScale{X(scale)},   fYScale{Y(scale)}
+        , fXSkew{X(skew)},     fYSkew{Y(skew)} { }
+    void processPoints(Sk4s* xs, Sk4s* ys) const {
+        Sk4s newXs = fXScale * *xs +  fXSkew * *ys + fXOffset;
+        Sk4s newYs =  fYSkew * *xs + fYScale * *ys + fYOffset;
+        // Written through temporaries because both results read the original *xs and *ys.
+        *xs = newXs;
+        *ys = newYs;
+    }
+    // Always declines (returns false), so the caller has to fall back to point lists.
+    template <typename Next>
+    bool maybeProcessSpan(Span span, Next* next) const {
+        return false;
+    }
+
+private:
+    const SkScalar fXOffset, fYOffset;
+    const SkScalar fXScale,  fYScale;
+    const SkScalar fXSkew,   fYSkew;
+};
+
+class PerspectiveMatrixStrategy {  // Matrix strategy applying an affine map plus a divide by z.
+public:
+    PerspectiveMatrixStrategy(SkVector offset, SkVector scale, SkVector skew,
+                              SkVector zSkew, SkScalar zOffset)
+        : fXOffset{X(offset)}, fYOffset{Y(offset)}, fZOffset{zOffset}
+        , fXScale{X(scale)},   fYScale{Y(scale)}
+        , fXSkew{X(skew)},     fYSkew{Y(skew)}, fZXSkew{X(zSkew)}, fZYSkew{Y(zSkew)} { }
+    void processPoints(Sk4s* xs, Sk4s* ys) const {
+        Sk4s newXs = fXScale * *xs +  fXSkew * *ys + fXOffset;
+        Sk4s newYs =  fYSkew * *xs + fYScale * *ys + fYOffset;
+        Sk4s newZs =  fZXSkew * *xs + fZYSkew * *ys + fZOffset;
+        // Divide through by the computed z to get the final 2D coordinates.
+        *xs = newXs / newZs;
+        *ys = newYs / newZs;
+    }
+    // Always declines (returns false), so the caller has to fall back to point lists.
+    template <typename Next>
+    bool maybeProcessSpan(Span span, Next* next) const {
+        return false;
+    }
+
+private:
+    const SkScalar fXOffset, fYOffset, fZOffset;
+    const SkScalar fXScale,  fYScale;
+    const SkScalar fXSkew,   fYSkew,   fZXSkew, fZYSkew;
+};
+
+
+}  // namespace
+
+#endif  // SkLinearBitmapPipeline_matrix_DEFINED
diff --git a/src/core/SkLinearBitmapPipeline_sample.h b/src/core/SkLinearBitmapPipeline_sample.h
new file mode 100644
index 0000000000..a7f5d7383e
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline_sample.h
@@ -0,0 +1,1041 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkLinearBitmapPipeline_sampler_DEFINED
+#define SkLinearBitmapPipeline_sampler_DEFINED
+
+#include <tuple>
+
+#include "SkAutoMalloc.h"
+#include "SkColor.h"
+#include "SkColorPriv.h"
+#include "SkFixed.h"  // for SkFixed1 only. Don't use SkFixed in this file.
+#include "SkHalf.h"
+#include "SkLinearBitmapPipeline_core.h"
+#include "SkNx.h"
+#include "SkPM4fPriv.h"
+
+namespace {
+// Explanation of the math:
+//              1 - x      x
+//           +--------+--------+
+//           |        |        |
+//  1 - y    |  px00  |  px10  |
+//           |        |        |
+//           +--------+--------+
+//           |        |        |
+//    y      |  px01  |  px11  |
+//           |        |        |
+//           +--------+--------+
+//
+//
+// Each pixel pxXY is multiplied by a different factor derived from the fractional parts of x
+// and y:
+// * px00 -> (1 - x)(1 - y) = 1 - x - y + xy
+// * px10 -> x(1 - y) = x - xy
+// * px01 -> (1 - x)y = y - xy
+// * px11 -> xy
+// So x * y is calculated first and then used to calculate all the other factors.
+static Sk4s SK_VECTORCALL bilerp4(Sk4s xs, Sk4s ys, Sk4f px00, Sk4f px10,
+                                                    Sk4f px01, Sk4f px11) {
+    // Calculate fractional xs and ys.
+    Sk4s fxs = xs - xs.floor();
+    Sk4s fys = ys - ys.floor();
+    Sk4s fxys{fxs * fys};  // the shared x * y term
+    Sk4f sum = px11 * fxys;
+    sum = sum + px01 * (fys - fxys);
+    sum = sum + px10 * (fxs - fxys);
+    sum = sum + px00 * (Sk4f{1.0f} - fxs - fys + fxys);
+    return sum;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// PixelConverter is the lowest level interface to the source data. There is a PixelConverter for
+// each of the different SkColorTypes.
+template <SkColorType, SkGammaType> class PixelConverter;
+
+// Alpha handling:
+//   The alpha from the paint (tintColor) is used in the blend part of the pipeline to modulate
+// the entire bitmap. So, the tint color is given an alpha of 1.0 so that the blend-stage alpha
+// can modulate this color later.
+template <>
+class PixelConverter<kAlpha_8_SkColorType, kLinear_SkGammaType> {
+public:
+    using Element = uint8_t;
+    PixelConverter(const SkPixmap& srcPixmap, SkColor tintColor) {  // srcPixmap is unused here
+        fTintColor = SkColor4f::FromColor(tintColor);
+        fTintColor.fA = 1.0f;
+    }
+    // Scales the opaque tint color by the 8-bit alpha value normalized to [0, 1].
+    Sk4f toSk4f(const Element pixel) const {
+        return Sk4f::Load(&fTintColor) * (pixel * (1.0f/255.0f));
+    }
+
+private:
+    SkColor4f fTintColor;
+};
+
+template <SkGammaType gammaType>
+static inline Sk4f pmcolor_to_rgba(SkPMColor pixel) {  // SkPMColor -> float pixel
+    return swizzle_rb_if_bgra(  // R/B swap is conditional, per the helper's name
+            (gammaType == kSRGB_SkGammaType) ? Sk4f_fromS32(pixel)
+                                             : Sk4f_fromL32(pixel));
+}
+
+template <SkGammaType gammaType>
+class PixelConverter<kRGB_565_SkColorType, gammaType> {
+public:
+    using Element = uint16_t;
+    PixelConverter(const SkPixmap& srcPixmap) { }
+    // Widens the 16-bit pixel to a 32-bit pixel, then converts it as an SkPMColor.
+    Sk4f toSk4f(Element pixel) const {
+        return pmcolor_to_rgba<gammaType>(SkPixel16ToPixel32(pixel));
+    }
+};
+
+template <SkGammaType gammaType>
+class PixelConverter<kARGB_4444_SkColorType, gammaType> {
+public:
+    using Element = uint16_t;
+    PixelConverter(const SkPixmap& srcPixmap) { }
+    // Widens the 4444 pixel to a 32-bit pixel, then converts it as an SkPMColor.
+    Sk4f toSk4f(Element pixel) const {
+        return pmcolor_to_rgba<gammaType>(SkPixel4444ToPixel32(pixel));
+    }
+};
+
+template <SkGammaType gammaType>
+class PixelConverter<kRGBA_8888_SkColorType, gammaType> {
+public:
+    using Element = uint32_t;
+    PixelConverter(const SkPixmap& srcPixmap) { }
+    // Decodes with the sRGB or the linear path, chosen by the gammaType template argument.
+    Sk4f toSk4f(Element pixel) const {
+        return gammaType == kSRGB_SkGammaType
+               ? Sk4f_fromS32(pixel)
+               : Sk4f_fromL32(pixel);
+    }
+};
+
+template <SkGammaType gammaType>
+class PixelConverter<kBGRA_8888_SkColorType, gammaType> {
+public:
+    using Element = uint32_t;
+    PixelConverter(const SkPixmap& srcPixmap) { }
+    // Same decode as the RGBA converter, followed by an unconditional swizzle_rb.
+    Sk4f toSk4f(Element pixel) const {
+        return swizzle_rb(
+                   gammaType == kSRGB_SkGammaType ? Sk4f_fromS32(pixel) : Sk4f_fromL32(pixel));
+    }
+};
+
+template <SkGammaType gammaType>
+class PixelConverter<kIndex_8_SkColorType, gammaType> {
+public:
+    using Element = uint8_t;
+    PixelConverter(const SkPixmap& srcPixmap)
+    : fColorTableSize(srcPixmap.ctable()->count()){
+        SkColorTable* skColorTable = srcPixmap.ctable();
+        SkASSERT(skColorTable != nullptr);  // NOTE(review): ctable() was already dereferenced above
+        // Round the storage up to a 16-byte boundary; kColorTableSize has 12 bytes of slack.
+        fColorTable = (Sk4f*)SkAlign16((intptr_t)fColorTableStorage.get());
+        for (int i = 0; i < fColorTableSize; i++) {
+            fColorTable[i] = pmcolor_to_rgba<gammaType>((*skColorTable)[i]);
+        }
+    }
+    // Deep copy: the new converter fills its own aligned table instead of sharing the pointer.
+    PixelConverter(const PixelConverter& strategy)
+    : fColorTableSize{strategy.fColorTableSize}{
+        fColorTable = (Sk4f*)SkAlign16((intptr_t)fColorTableStorage.get());
+        for (int i = 0; i < fColorTableSize; i++) {
+            fColorTable[i] = strategy.fColorTable[i];
+        }
+    }
+    // Table lookup; entries at or above fColorTableSize are never initialized.
+    Sk4f toSk4f(Element index) const {
+        return fColorTable[index];
+    }
+
+private:
+    static const size_t kColorTableSize = sizeof(Sk4f[256]) + 12;  // assumes >= 4-byte aligned malloc - TODO confirm
+    const int           fColorTableSize;
+    SkAutoMalloc        fColorTableStorage{kColorTableSize};
+    Sk4f*               fColorTable;  // aligned view into fColorTableStorage
+};
+
+template <SkGammaType gammaType>
+class PixelConverter<kGray_8_SkColorType, gammaType> {
+public:
+    using Element = uint8_t;
+    PixelConverter(const SkPixmap& srcPixmap) { }
+    // Replicates the gray level into the first three channels with a fourth channel of 1.0.
+    Sk4f toSk4f(Element pixel) const {
+        float gray = (gammaType == kSRGB_SkGammaType)
+            ? sk_linear_from_srgb[pixel]
+            : pixel * (1/255.0f);
+        return {gray, gray, gray, 1.0f};
+    }
+};
+
+template <>
+class PixelConverter<kRGBA_F16_SkColorType, kLinear_SkGammaType> {
+public:
+    using Element = uint64_t;
+    PixelConverter(const SkPixmap& srcPixmap) { }
+    // NOTE(review): presumably the 64-bit element packs four half floats - confirm in SkHalf.h.
+    Sk4f toSk4f(const Element pixel) const {
+        return SkHalfToFloat_finite_ftz(pixel);
+    }
+};
+
+class PixelAccessorShim {  // Forwards every call to a PixelAccessorInterface held by pointer.
+public:
+    explicit PixelAccessorShim(SkLinearBitmapPipeline::PixelAccessorInterface* accessor)
+        : fPixelAccessor(accessor) { }
+
+    void SK_VECTORCALL getFewPixels(
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const {
+        fPixelAccessor->getFewPixels(n, xs, ys, px0, px1, px2);
+    }
+
+    void SK_VECTORCALL get4Pixels(
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const {
+        fPixelAccessor->get4Pixels(xs, ys, px0, px1, px2, px3);
+    }
+
+    void get4Pixels(
+        const void* src, int index, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const {
+        fPixelAccessor->get4Pixels(src, index, px0, px1, px2, px3);
+    }
+
+    Sk4f getPixelFromRow(const void* row, int index) const {
+        return fPixelAccessor->getPixelFromRow(row, index);
+    }
+
+    Sk4f getPixelAt(int index) const {
+        return fPixelAccessor->getPixelAt(index);
+    }
+
+    const void* row(int y) const {
+        return fPixelAccessor->row(y);
+    }
+
+private:
+    SkLinearBitmapPipeline::PixelAccessorInterface* const fPixelAccessor;  // not deleted here
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// PixelAccessor handles all the same plumbing for all the PixelConverters.
+template <SkColorType colorType, SkGammaType gammaType>
+class PixelAccessor final : public SkLinearBitmapPipeline::PixelAccessorInterface {
+    using Element = typename PixelConverter<colorType, gammaType>::Element;
+public:
+    template <typename... Args>
+    PixelAccessor(const SkPixmap& srcPixmap, Args&&... args)
+        : fSrc{static_cast<const Element*>(srcPixmap.addr())}
+        , fWidth{srcPixmap.rowBytesAsPixels()}
+        , fConverter{srcPixmap, std::forward<Args>(args)...} { }  // forward; never move lvalues
+    // Fetch the first n (1..3) pixels at (xs[i], ys[i]); outputs beyond n are left untouched.
+    void SK_VECTORCALL getFewPixels (
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const override {
+        Sk4i bufferLoc = ys * fWidth + xs;
+        switch (n) {
+            case 3:
+                *px2 = this->getPixelAt(bufferLoc[2]);  // fall through
+            case 2:
+                *px1 = this->getPixelAt(bufferLoc[1]);  // fall through
+            case 1:
+                *px0 = this->getPixelAt(bufferLoc[0]);  // fall through
+            default:
+                break;
+        }
+    }
+    // Fetch the four pixels at (xs[i], ys[i]).
+    void SK_VECTORCALL get4Pixels(
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const override {
+        Sk4i bufferLoc = ys * fWidth + xs;
+        *px0 = this->getPixelAt(bufferLoc[0]);
+        *px1 = this->getPixelAt(bufferLoc[1]);
+        *px2 = this->getPixelAt(bufferLoc[2]);
+        *px3 = this->getPixelAt(bufferLoc[3]);
+    }
+    // Fetch four consecutive pixels of the row src, starting at index.
+    void get4Pixels(
+        const void* src, int index, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const override {
+        *px0 = this->getPixelFromRow(src, index + 0);
+        *px1 = this->getPixelFromRow(src, index + 1);
+        *px2 = this->getPixelFromRow(src, index + 2);
+        *px3 = this->getPixelFromRow(src, index + 3);
+    }
+    // Convert the element at position index of the given row.
+    Sk4f getPixelFromRow(const void* row, int index) const override {
+        const Element* src = static_cast<const Element*>(row);
+        return fConverter.toSk4f(src[index]);
+    }
+    // Convert the element at position index, counted from the start of the pixel buffer.
+    Sk4f getPixelAt(int index) const override {
+        return this->getPixelFromRow(fSrc, index);
+    }
+    // Pointer to the first element of row y; fWidth is the row stride measured in pixels.
+    const void* row(int y) const override { return fSrc + y * fWidth; }
+
+private:
+    const Element* const                 fSrc;
+    const int                            fWidth;
+    PixelConverter<colorType, gammaType> fConverter;
+};
+
+// We're moving through source space at a rate of 1 source pixel per 1 dst pixel.
+// We'll never re-use pixels, but we can at least load contiguous pixels.
+template <typename Next, typename Strategy>
+static void src_strategy_blend(Span span, Next* next, Strategy* strategy) {
+    SkPoint start;
+    SkScalar length;
+    int count;
+    std::tie(start, length, count) = span;
+    int ix = SkScalarFloorToInt(X(start));
+    const void* row = strategy->row((int)std::floor(Y(start)));
+    if (length > 0) {  // source is walked left to right
+        while (count >= 4) {
+            Sk4f px0, px1, px2, px3;
+            strategy->get4Pixels(row, ix, &px0, &px1, &px2, &px3);
+            next->blend4Pixels(px0, px1, px2, px3);
+            ix += 4;
+            count -= 4;
+        }
+
+        while (count > 0) {
+            next->blendPixel(strategy->getPixelFromRow(row, ix));
+            ix += 1;
+            count -= 1;
+        }
+    } else {  // length <= 0: source is walked right to left
+        while (count >= 4) {
+            Sk4f px0, px1, px2, px3;
+            strategy->get4Pixels(row, ix - 3, &px3, &px2, &px1, &px0);  // load ix-3..ix reversed
+            next->blend4Pixels(px0, px1, px2, px3);
+            ix -= 4;
+            count -= 4;
+        }
+
+        while (count > 0) {
+            next->blendPixel(strategy->getPixelFromRow(row, ix));
+            ix -= 1;
+            count -= 1;
+        }
+    }
+}
+
+// -- NearestNeighborSampler -----------------------------------------------------------------------
+// NearestNeighborSampler - use nearest neighbor filtering to create runs of destination pixels.
+template<typename Accessor, typename Next>
+class NearestNeighborSampler : public SkLinearBitmapPipeline::SampleProcessorInterface {
+public:
+    template<typename... Args>
+    NearestNeighborSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next, Args&& ... args)
+    : fNext{next}, fAccessor{std::forward<Args>(args)...} { }
+
+    NearestNeighborSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next,
+    const NearestNeighborSampler& sampler)
+    : fNext{next}, fAccessor{sampler.fAccessor} { }
+
+    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        SkASSERT(0 < n && n < 4);
+        Sk4f px0, px1, px2;
+        fAccessor.getFewPixels(n, SkNx_cast<int>(xs), SkNx_cast<int>(ys), &px0, &px1, &px2);
+        if (n >= 1) fNext->blendPixel(px0);
+        if (n >= 2) fNext->blendPixel(px1);
+        if (n >= 3) fNext->blendPixel(px2);
+    }
+
+    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        Sk4f px0, px1, px2, px3;
+        fAccessor.get4Pixels(SkNx_cast<int>(xs), SkNx_cast<int>(ys), &px0, &px1, &px2, &px3);
+        fNext->blend4Pixels(px0, px1, px2, px3);
+    }
+    // Dispatch on |dx| = |length| / (count - 1) without dividing: slow (<1), unit (==1), fast.
+    void pointSpan(Span span) override {
+        SkASSERT(!span.isEmpty());
+        SkPoint start;
+        SkScalar length;
+        int count;
+        std::tie(start, length, count) = span;
+        SkScalar absLength = SkScalarAbs(length);
+        if (absLength < (count - 1)) {
+            this->spanSlowRate(span);
+        } else if (absLength == (count - 1)) {
+            this->spanUnitRate(span);
+        } else {
+            this->spanFastRate(span);
+        }
+    }
+
+    void repeatSpan(Span span, int32_t repeatCount) override {
+        while (repeatCount > 0) {
+            this->pointSpan(span);
+            repeatCount--;
+        }
+    }
+
+private:
+    // When moving through source space more slowly than dst space (zoomed in),
+    // we'll be sampling from the same source pixel more than once.
+    void spanSlowRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        SkScalar x = X(start);
+        // fx is a fixed 48.16 number.
+        int64_t fx = static_cast<int64_t>(x * SK_Fixed1);
+        SkScalar dx = length / (count - 1);
+        // fdx is a fixed 48.16 number.
+        int64_t fdx = static_cast<int64_t>(dx * SK_Fixed1);
+
+        const void* row = fAccessor.row((int)std::floor(Y(start)));
+        Next* next = fNext;
+
+        int64_t ix = fx >> 16;
+        int64_t prevIX = ix;
+        Sk4f fpixel = fAccessor.getPixelFromRow(row, static_cast<int>(ix));
+
+        // When dx is less than one, each pixel is used more than once. Using the fixed point fx
+        // allows the code to quickly check that the same pixel is being used. The code uses this
+        // same pixel check to do the sRGB and normalization only once.
+        auto getNextPixel = [&]() {
+            if (ix != prevIX) {
+                fpixel = fAccessor.getPixelFromRow(row, static_cast<int>(ix));
+                prevIX = ix;
+            }
+            fx += fdx;
+            ix = fx >> 16;
+            return fpixel;
+        };
+
+        while (count >= 4) {
+            Sk4f px0 = getNextPixel();
+            Sk4f px1 = getNextPixel();
+            Sk4f px2 = getNextPixel();
+            Sk4f px3 = getNextPixel();
+            next->blend4Pixels(px0, px1, px2, px3);
+            count -= 4;
+        }
+        while (count > 0) {
+            next->blendPixel(getNextPixel());
+            count -= 1;
+        }
+    }
+
+    // We're moving through source space at a rate of 1 source pixel per 1 dst pixel.
+    // We'll never re-use pixels, but we can at least load contiguous pixels.
+    void spanUnitRate(Span span) {
+        src_strategy_blend(span, fNext, &fAccessor);
+    }
+
+    // We're moving through source space faster than dst (zoomed out),
+    // so we'll never reuse a source pixel or be able to do contiguous loads.
+    void spanFastRate(Span span) {
+        span_fallback(span, this);
+    }
+
+    Next* const fNext;
+    Accessor    fAccessor;
+};
+
+// From an edgeType, the integer value of a pixel vs, and the integer value of the extreme edge
+// vMax, take the point which might be off the tile by one pixel and either wrap it or pin it to
+// generate the right pixel. The value vs is on the interval [-1, vMax + 1]. It produces a value
+// on the interval [0, vMax].
+// Note: vMax is not width or height, but width-1 or height-1 because it is the largest valid pixel.
+static inline int adjust_edge(SkShader::TileMode edgeType, int vs, int vMax) {
+    SkASSERT(-1 <= vs && vs <= vMax + 1);
+    switch (edgeType) {
+        case SkShader::kClamp_TileMode:
+        case SkShader::kMirror_TileMode:  // mirror is pinned to the edge exactly like clamp here
+            vs = std::max(vs, 0);
+            vs = std::min(vs, vMax);
+            break;
+        case SkShader::kRepeat_TileMode:
+            vs = (vs <= vMax) ? vs : 0;  // one past the far edge wraps to the first pixel
+            vs =    (vs >= 0) ? vs : vMax;  // one before the near edge wraps to the last pixel
+            break;
+    }
+    SkASSERT(0 <= vs && vs <= vMax);
+    return vs;
+}
+
+// From a sample point on the tile, return the top or left filter value.
+// The result r should be in the range (0, 1]. Since this represents the weight given to the top
+// left element, then if x == 0.5 the filter value should be 1.0.
+// The input sample point must be on the tile, therefore it must be >= 0.
+static SkScalar sample_to_filter(SkScalar x) {
+    SkASSERT(x >= 0.0f);
+    // The usual form of the top or left edge is x - .5, but since we are working on the unit
+    // square, then x + .5 works just as well. This also guarantees that v > 0.0 allowing the use
+    // of trunc.
+    SkScalar v = x + 0.5f;
+    // Produce the top or left offset, a value on the range [0, 1).
+    SkScalar f = v - SkScalarTruncToScalar(v);
+    // Produce the filter value which is on the range (0, 1].
+    SkScalar r =  1.0f - f;
+    SkASSERT(0.0f < r && r <= 1.0f);
+    return r;
+}
+
+// -- BilerpSampler --------------------------------------------------------------------------------
+// BilerpSampler - use a bilerp filter to create runs of destination pixels.
+// Note: in the code below, there are two types of points
+//       * sample points - these are the points passed in by pointList* and Spans.
+//       * filter points - are created from a sample point to form the coordinates of the points
+//                         to use in the filter and to generate the filter values.
+template<typename Accessor, typename Next>
+class BilerpSampler : public SkLinearBitmapPipeline::SampleProcessorInterface {
+public:
+    template<typename... Args>
+    BilerpSampler(
+        SkLinearBitmapPipeline::BlendProcessorInterface* next,
+        SkISize dimensions,
+        SkShader::TileMode xTile, SkShader::TileMode yTile,
+        Args&& ... args
+    )
+        : fNext{next}
+        , fXEdgeType{xTile}
+        , fXMax{dimensions.width() - 1}
+        , fYEdgeType{yTile}
+        , fYMax{dimensions.height() - 1}
+        , fAccessor{std::forward<Args>(args)...} { }
+
+    BilerpSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next,
+                   const BilerpSampler& sampler)  // Same settings as sampler, new next stage.
+        : fNext{next}  // The only member that is not copied from sampler.
+        , fXEdgeType{sampler.fXEdgeType}
+        , fXMax{sampler.fXMax}
+        , fYEdgeType{sampler.fYEdgeType}
+        , fYMax{sampler.fYMax}
+        , fAccessor{sampler.fAccessor} { }
+
+    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        // Bilerp and blend the first n (1 to 3) lanes of xs/ys, one pixel at a time.
+        SkASSERT(0 < n && n < 4);
+        // Lane 3 is never touched here, even when the assert above is compiled out.
+        const int lanes = n < 3 ? n : 3;
+        for (int lane = 0; lane < lanes; lane++) {
+            const SkPoint sample{xs[lane], ys[lane]};
+            fNext->blendPixel(this->bilerpSamplePoint(sample));
+        }
+    }
+
+    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        // Bilerp one lane of the xs/ys vectors into a color.
+        auto lane = [&](int i) { return this->bilerpSamplePoint(SkPoint{xs[i], ys[i]}); };
+        // All four colors go to the blender in a single call.
+        fNext->blend4Pixels(lane(0), lane(1), lane(2), lane(3));
+    }
+
+    void pointSpan(Span span) override {
+        // Hand the span to the routine specialized for its step size |dx|. Comparing |length|
+        // with (count - 1) classifies dx = length / (count - 1) without needing the divide.
+        SkASSERT(!span.isEmpty());
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        if (count == 0) {
+            return;
+        }
+
+        // A lone sample point is bilerped directly.
+        if (count == 1) {
+            fNext->blendPixel(this->bilerpSamplePoint(start));
+            return;
+        }
+
+        const SkScalar steps = count - 1;
+        const SkScalar absLength = SkScalarAbs(length);
+        if (absLength == 0.0f) {
+            // |dx| == 0: all samples are the same point, as for a span clamped to an edge pixel.
+            this->spanZeroRate(span);
+            return;
+        }
+        if (absLength < steps) {
+            // 0 < |dx| < 1.
+            this->spanSlowRate(span);
+            return;
+        }
+        if (absLength == steps) {
+            // |dx| == 1. Aligned pixels can go fast; sub-pixel offsets need a bilerp.
+            const bool aligned = sample_to_filter(span.startX()) == 1.0f
+                              && sample_to_filter(span.startY()) == 1.0f;
+            if (aligned) {
+                src_strategy_blend(span, fNext, &fAccessor);
+            } else {
+                this->spanUnitRate(span);
+            }
+            return;
+        }
+        // 1 < |dx| < 2 is the medium rate; |dx| >= 2 is the fast rate.
+        if (absLength < 2.0f * steps) {
+            this->spanMediumRate(span);
+        } else {
+            this->spanFastRate(span);
+        }
+    }
+
+    void repeatSpan(Span span, int32_t repeatCount) override {
+        // No shortcut is taken here: the span is simply sampled again for every repeat.
+        for (int32_t pass = 0; pass < repeatCount; pass++) {
+            this->pointSpan(span);
+        }
+    }
+
+private:
+
+    // Convert a sample point to the integer coordinates of the four pixels the filter reads.
+    void filterPoints(SkPoint sample, Sk4i* filterXs, Sk4i* filterYs) {
+        // sample - 0.5 may be less than zero, so the low edge must use Floor.
+        int x0 = adjust_edge(fXEdgeType, SkScalarFloorToInt(X(sample) - 0.5f), fXMax);
+        // sample + 0.5 is always greater than zero, so the high edge can use the faster Trunc.
+        int x1 = adjust_edge(fXEdgeType, SkScalarTruncToInt(X(sample) + 0.5f), fXMax);
+        int y0 = adjust_edge(fYEdgeType, SkScalarFloorToInt(Y(sample) - 0.5f), fYMax);
+        int y1 = adjust_edge(fYEdgeType, SkScalarTruncToInt(Y(sample) + 0.5f), fYMax);
+
+        *filterXs = Sk4i{x0, x1, x0, x1};  // Lane order: TL, TR, BL, BR.
+        *filterYs = Sk4i{y0, y0, y1, y1};  // Float literals above avoid promotion to double.
+    }
+
+    // Produce the color for a sample point by bilerping the four pixels its filter covers.
+    Sk4f bilerpSamplePoint(SkPoint sample) {
+        Sk4i cols, rows;
+        this->filterPoints(sample, &cols, &rows);
+        Sk4f tl, tr, bl, br;  // top-left, top-right, bottom-left, bottom-right
+        fAccessor.get4Pixels(cols, rows, &tl, &tr, &bl, &br);
+        return bilerp4(Sk4f{X(sample) - 0.5f}, Sk4f{Y(sample) - 0.5f}, tl, tr, bl, br);
+    }
+
+    // Fetch the pixel in column x of each of two rows: row0 goes to *px0 and row1 to *px1.
+    void get2PixelColumn(const void* row0, const void* row1, int x, Sk4f* px0, Sk4f* px1) {
+        *px0 = fAccessor.getPixelFromRow(row0, x);
+        *px1 = fAccessor.getPixelFromRow(row1, x);
+    }
+
+    // |dx| == 0. Every sample is the same point, so compute one color and repeat it count times.
+    void spanZeroRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        SkASSERT(length == 0.0f);
+
+        // Only the rows of the filter are needed; iYs[0] is the top row and iYs[2] the bottom.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const topRow    = fAccessor.row(iYs[0]);
+        const void* const bottomRow = fAccessor.row(iYs[2]);
+
+        // A single column is read: no horizontal blend is done for a zero-length span.
+        const int column = SkScalarFloorToInt(X(start));
+        const Sk4f top = fAccessor.getPixelFromRow(topRow, column);
+        const Sk4f bottom = fAccessor.getPixelFromRow(bottomRow, column);
+
+        // Blend the two vertically using the weight of the top row.
+        const SkScalar topWeight = sample_to_filter(Y(start));
+        const Sk4f color = top * topWeight + (1.0f - topWeight) * bottom;
+
+        // Emit in groups of four while possible, then finish one at a time.
+        for (; count >= 4; count -= 4) {
+            fNext->blend4Pixels(color, color, color, color);
+        }
+        for (; count > 0; count -= 1) {
+            fNext->blendPixel(color);
+        }
+    }
+
+    // 0 < |dx| < 1. This code reuses the calculations from previous pixels to reduce
+    // computation. In particular, several destination pixels may be generated from the same
+    // four source pixels.
+    // In the following code a "part" is a combination of two pixels from the same column of the
+    // filter.
+    void spanSlowRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+
+        // Calculate the distance between each sample point.
+        const SkScalar dx = length / (count - 1);  // pointSpan handles counts of 0 and 1 itself.
+        SkASSERT(-1.0f < dx && dx < 1.0f && dx != 0.0f);
+
+        // Generate the filter values for the top-left corner.
+        // Note: these values are in filter space; this has implications about how to adjust
+        // these values at each step. For example, as the sample point increases, the filter
+        // value decreases, this is because the filter and position are related by
+        // (1 - (X(sample) - .5)) % 1. The (1 - stuff) causes the filter to move in the opposite
+        // direction of the sample point which is increasing by dx.
+        SkScalar filterX = sample_to_filter(X(start));
+        SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate the four filter points from the sample point start. Generate the row* values.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const row0 = fAccessor.row(iYs[0]);  // Top row of the filter.
+        const void* const row1 = fAccessor.row(iYs[2]);  // Bottom row of the filter.
+
+        // Generate part of the filter value at xColumn; the edge type is applied to xColumn.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
+        };
+
+        // The leftPart is made up of two pixels from the left column of the filter, right part
+        // is similar. The top and bottom pixels in the *Part are created as a linear blend of
+        // the top and bottom pixels using filterY. See the partAtColumn function above.
+        Sk4f leftPart  = partAtColumn(iXs[0]);
+        Sk4f rightPart = partAtColumn(iXs[1]);
+
+        // Create a destination color by blending together a left and right part using filterX.
+        auto bilerp = [&](const Sk4f& leftPart, const Sk4f& rightPart) {
+            Sk4f pixel = leftPart * filterX + rightPart * (1.0f - filterX);
+            return check_pixel(pixel);
+        };
+
+        // Send the first pixel to the destination. This simplifies the loop structure so that no
+        // extra pixels are fetched for the last iteration of the loop.
+        fNext->blendPixel(bilerp(leftPart, rightPart));
+        count -= 1;
+
+        if (dx > 0.0f) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
+            int rightPartCursor = iXs[1];
+
+            // Advance the filter from left to right. Remember that moving the top-left corner of
+            // the filter to the right actually makes the filter value smaller.
+            auto advanceFilter = [&]() {
+                filterX -= dx;
+                if (filterX <= 0.0f) {  // Crossed into the next column: wrap back on to (0, 1].
+                    filterX += 1.0f;
+                    leftPart = rightPart;  // The old right column is reused as the new left.
+                    rightPartCursor += 1;
+                    rightPart = partAtColumn(rightPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+
+                return bilerp(leftPart, rightPart);
+            };
+
+            while (count >= 4) {
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                count -= 4;
+            }
+
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
+                count -= 1;
+            }
+        } else {
+            // * negative direction - generate destination pixels by sliding the filter from
+            //                        right to left.
+            int leftPartCursor = iXs[0];
+
+            // Advance the filter from right to left. Remember that moving the top-left corner of
+            // the filter to the left actually makes the filter value larger.
+            auto advanceFilter = [&]() {
+                // Remember, dx < 0 therefore this adds |dx| to filterX.
+                filterX -= dx;
+                // At this point filterX may be > 1, and needs to be wrapped back on to the filter
+                // interval, and the next column in the filter is calculated.
+                if (filterX > 1.0f) {
+                    filterX -= 1.0f;
+                    rightPart = leftPart;  // The old left column is reused as the new right.
+                    leftPartCursor -= 1;
+                    leftPart = partAtColumn(leftPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+
+                return bilerp(leftPart, rightPart);
+            };
+
+            while (count >= 4) {
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                count -= 4;
+            }
+
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
+                count -= 1;
+            }
+        }
+    }
+
+    // |dx| == 1. Moving through source space at a rate of 1 source pixel per 1 dst pixel, so the
+    // filter values never change. Every filter part is used for two destination pixels, and the
+    // code can bulk load four pixels at a time.
+    void spanUnitRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        SkASSERT(SkScalarAbs(length) == (count - 1));
+
+        // Calculate the four filter points of start, and use the two different Y values to
+        // generate the row pointers.
+        Sk4i iXs, iYs;
+        filterPoints(start, &iXs, &iYs);
+        const void* row0 = fAccessor.row(iYs[0]);  // Top row of the filter.
+        const void* row1 = fAccessor.row(iYs[2]);  // Bottom row of the filter.
+
+        // Calculate the filter values for the top-left filter element.
+        const SkScalar filterX = sample_to_filter(X(start));
+        const SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate part of the filter value at xColumn; the edge type is applied to xColumn.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
+        };
+
+        auto get4Parts = [&](int ix, Sk4f* part0, Sk4f* part1, Sk4f* part2, Sk4f* part3) {
+            // When columns ix .. ix + 3 are all within [0, fXMax] go fast using bulk loads.
+            // Otherwise fetch each column on its own so that the edge type is applied.
+            if (0 <= ix && ix <= fXMax - 3) {
+                Sk4f px00, px10, px20, px30,
+                     px01, px11, px21, px31;
+                fAccessor.get4Pixels(row0, ix, &px00, &px10, &px20, &px30);
+                fAccessor.get4Pixels(row1, ix, &px01, &px11, &px21, &px31);
+                *part0 = filterY * px00 + (1.0f - filterY) * px01;
+                *part1 = filterY * px10 + (1.0f - filterY) * px11;
+                *part2 = filterY * px20 + (1.0f - filterY) * px21;
+                *part3 = filterY * px30 + (1.0f - filterY) * px31;
+            } else {
+                *part0 = partAtColumn(ix + 0);
+                *part1 = partAtColumn(ix + 1);
+                *part2 = partAtColumn(ix + 2);
+                *part3 = partAtColumn(ix + 3);
+            }
+        };
+
+        auto bilerp = [&](const Sk4f& part0, const Sk4f& part1) {  // part0 is the left column.
+            return part0 * filterX + part1 * (1.0f - filterX);
+        };
+
+        if (length > 0) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
+
+            // overlapPart is the filter part from the end of the previous four pixels used at
+            // the start of the next four pixels.
+            Sk4f overlapPart = partAtColumn(iXs[0]);
+            int rightColumnCursor = iXs[1];
+            while (count >= 4) {
+                Sk4f part0, part1, part2, part3;
+                get4Parts(rightColumnCursor, &part0, &part1, &part2, &part3);
+                Sk4f px0 = bilerp(overlapPart, part0);
+                Sk4f px1 = bilerp(part0, part1);
+                Sk4f px2 = bilerp(part1, part2);
+                Sk4f px3 = bilerp(part2, part3);
+                overlapPart = part3;
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                rightColumnCursor += 4;
+                count -= 4;
+            }
+
+            while (count > 0) {  // Fewer than four pixels left: one column at a time.
+                Sk4f rightPart = partAtColumn(rightColumnCursor);
+
+                fNext->blendPixel(bilerp(overlapPart, rightPart));
+                overlapPart = rightPart;
+                rightColumnCursor += 1;
+                count -= 1;
+            }
+        } else {
+            // * negative direction - slide the filter from right to left. get4Parts loads in
+            //                        ascending column order, hence the reversed out-params.
+            Sk4f overlapPart = partAtColumn(iXs[1]);
+            int leftColumnCursor = iXs[0];
+
+            while (count >= 4) {
+                Sk4f part0, part1, part2, part3;
+                get4Parts(leftColumnCursor - 3, &part3, &part2, &part1, &part0);
+                Sk4f px0 = bilerp(part0, overlapPart);
+                Sk4f px1 = bilerp(part1, part0);
+                Sk4f px2 = bilerp(part2, part1);
+                Sk4f px3 = bilerp(part3, part2);
+                overlapPart = part3;
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                leftColumnCursor -= 4;
+                count -= 4;
+            }
+
+            while (count > 0) {  // Fewer than four pixels left: one column at a time.
+                Sk4f leftPart = partAtColumn(leftColumnCursor);
+
+                fNext->blendPixel(bilerp(leftPart, overlapPart));
+                overlapPart = leftPart;
+                leftColumnCursor -= 1;
+                count -= 1;
+            }
+        }
+    }
+
+    // 1 < |dx| < 2. Going through the source pixels at a faster rate than the dest pixels, but
+    // still slow enough to take advantage of previous calculations.
+    void spanMediumRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+
+        // Calculate the distance between each sample point.
+        const SkScalar dx = length / (count - 1);  // pointSpan handles counts of 0 and 1 itself.
+        SkASSERT((-2.0f < dx && dx < -1.0f) || (1.0f < dx && dx < 2.0f));
+
+        // Generate the filter values for the top-left corner.
+        // Note: these values are in filter space; this has implications about how to adjust
+        // these values at each step. For example, as the sample point increases, the filter
+        // value decreases, this is because the filter and position are related by
+        // (1 - (X(sample) - .5)) % 1. The (1 - stuff) causes the filter to move in the opposite
+        // direction of the sample point which is increasing by dx.
+        SkScalar filterX = sample_to_filter(X(start));
+        SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate the four filter points from the sample point start. Generate the row* values.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const row0 = fAccessor.row(iYs[0]);  // Top row of the filter.
+        const void* const row1 = fAccessor.row(iYs[2]);  // Bottom row of the filter.
+
+        // Generate part of the filter value at xColumn; the edge type is applied to xColumn.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
+        };
+
+        // The leftPart is made up of two pixels from the left column of the filter, right part
+        // is similar. The top and bottom pixels in the *Part are created as a linear blend of
+        // the top and bottom pixels using filterY. See the partAtColumn function above.
+        Sk4f leftPart  = partAtColumn(iXs[0]);
+        Sk4f rightPart = partAtColumn(iXs[1]);
+
+        // Create a destination color by blending together a left and right part using filterX.
+        auto bilerp = [&](const Sk4f& leftPart, const Sk4f& rightPart) {
+            Sk4f pixel = leftPart * filterX + rightPart * (1.0f - filterX);
+            return check_pixel(pixel);
+        };
+
+        // Send the first pixel to the destination. This simplifies the loop structure so that no
+        // extra pixels are fetched for the last iteration of the loop.
+        fNext->blendPixel(bilerp(leftPart, rightPart));
+        count -= 1;
+
+        if (dx > 0.0f) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
+            int rightPartCursor = iXs[1];
+
+            // Advance the filter from left to right. Remember that moving the top-left corner of
+            // the filter to the right actually makes the filter value smaller.
+            auto advanceFilter = [&]() {
+                filterX -= dx;
+                // filterX is now on (-2, 0). Above -1 is a one column step; the rest is two.
+                if (filterX > -1.0f) {
+                    filterX += 1.0f;
+                    leftPart = rightPart;  // The old right column is reused as the new left.
+                    rightPartCursor += 1;
+                    rightPart = partAtColumn(rightPartCursor);
+                } else {
+                    filterX += 2.0f;
+                    rightPartCursor += 2;
+                    leftPart = partAtColumn(rightPartCursor - 1);  // Left column is right - 1.
+                    rightPart = partAtColumn(rightPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+
+                return bilerp(leftPart, rightPart);
+            };
+
+            while (count >= 4) {
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                count -= 4;
+            }
+
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
+                count -= 1;
+            }
+        } else {
+            // * negative direction - generate destination pixels by sliding the filter from
+            //                        right to left.
+            int leftPartCursor = iXs[0];
+
+            auto advanceFilter = [&]() {
+                // Remember, dx < 0 therefore this adds |dx| to filterX.
+                filterX -= dx;
+                // filterX is now on (1, 3). Up to and including 2 is a one column step (landing
+                if (filterX <= 2.0f) {  // on (0, 1]); above 2 the filter moves two columns.
+                    filterX -= 1.0f;
+                    rightPart = leftPart;  // The old left column is reused as the new right.
+                    leftPartCursor -= 1;
+                    leftPart = partAtColumn(leftPartCursor);
+                } else {
+                    filterX -= 2.0f;
+                    leftPartCursor -= 2;
+                    rightPart = partAtColumn(leftPartCursor + 1);  // Right column is left + 1.
+                    leftPart = partAtColumn(leftPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+                return bilerp(leftPart, rightPart);
+            };
+
+            while (count >= 4) {
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                count -= 4;
+            }
+
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
+                count -= 1;
+            }
+        }
+    }
+
+    // |dx| >= 2 (zoomed out): source space is crossed faster than dst, so no source pixel is
+    // ever reused and contiguous loads are not possible. Each sample is bilerped on its own.
+    void spanFastRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        const SkScalar dx = length / (count - 1);
+
+        // x advances by repeated addition of dx; y is constant across the span.
+        SkScalar x = X(start);
+        const SkScalar y = Y(start);
+        for (int remaining = count; remaining > 0; remaining--) {
+            fNext->blendPixel(this->bilerpSamplePoint(SkPoint{x, y}));
+            x += dx;
+        }
+    }
+
+    Next* const              fNext;
+    const SkShader::TileMode fXEdgeType;
+    const int                fXMax;
+    const SkShader::TileMode fYEdgeType;
+    const int                fYMax;
+    Accessor                 fAccessor;
+};
+
+}  // namespace
+
+#endif  // SkLinearBitmapPipeline_sampler_DEFINED
diff --git a/src/core/SkLinearBitmapPipeline_tile.h b/src/core/SkLinearBitmapPipeline_tile.h
new file mode 100644
index 0000000000..e18f7a1a5d
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline_tile.h
@@ -0,0 +1,412 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkLinearBitmapPipeline_tile_DEFINED
+#define SkLinearBitmapPipeline_tile_DEFINED
+
+#include "SkLinearBitmapPipeline_core.h"
+#include "SkPM4f.h"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+namespace {
+
+void assertTiled(const Sk4s& vs, SkScalar vMax) {  // Asserts each of the 4 lanes is in [0, vMax).
+    SkASSERT(0 <= vs[0] && vs[0] < vMax);
+    SkASSERT(0 <= vs[1] && vs[1] < vMax);
+    SkASSERT(0 <= vs[2] && vs[2] < vMax);
+    SkASSERT(0 <= vs[3] && vs[3] < vMax);
+}
+
+/*
+ * Clamp in the X direction.
+ * Observations:
+ *   * sample point border - if the sample point is <= 0.5 or >= Max - 0.5 then the pixel
+ *     value should be a border color. For this case, create the span using clampToSinglePixel.
+ */
+class XClampStrategy {
+public:
+    // max is the exclusive upper bound for tiled x values.
+    XClampStrategy(int32_t max)
+        : fXMaxPixel{SkScalar(max - SK_ScalarHalf)}
+        , fXMax{SkScalar(max)} { }
+
+    // Clamp the x values to the range [0.5, max - 0.5].
+    void tileXPoints(Sk4s* xs) {
+        *xs = Sk4s::Min(Sk4s::Max(*xs, SK_ScalarHalf), fXMaxPixel);
+        assertTiled(*xs, fXMax);
+    }
+
+    // Split the span into clamped and unclamped pieces; false means it is left unprocessed.
+    template<typename Next>
+    bool maybeProcessSpan(Span originalSpan, Next* next) {
+        SkASSERT(!originalSpan.isEmpty());
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = originalSpan;
+        const SkScalar y = Y(start);
+        Span span{{X(start), y}, length, count};
+
+        // A span that is completelyWithin [0, max) is passed along as is.
+        if (span.completelyWithin(0.0f, fXMax)) {
+            next->pointSpan(span);
+            return true;
+        }
+        // Lone points and zero length spans are not handled here.
+        if (1 == count || 0.0f == length) {
+            return false;
+        }
+
+        SkScalar dx = length / (count - 1);
+
+        //    A                 B     C
+        // +-------+-------+-------++-------+-------+-------+     +-------+-------++------
+        // |  *---*|---*---|*---*--||-*---*-|---*---|*---...|     |--*---*|---*---||*---*....
+        // |       |       |       ||       |       |       | ... |       |       ||
+        // |       |       |       ||       |       |       |     |       |       ||
+        // +-------+-------+-------++-------+-------+-------+     +-------+-------++------
+        //                         ^                                              ^
+        //                         | xMin                                  xMax-1 | xMax
+        //
+        //     *---*---*---... - track of samples. * = sample
+        //
+        //     +-+                                 ||
+        //     | |  - pixels in source space.      || - tile border.
+        //     +-+                                 ||
+        //
+        // The length from A to B is the length in source space or 4 * dx or (count - 1) * dx
+        // where dx is the distance between samples. There are 5 destination pixels
+        // corresponding to 5 samples specified in the A, B span. The distance from A to the next
+        // span starting at C is 5 * dx, so count * dx.
+        // Remember, count is the number of pixels needed for the destination and the number of
+        // samples.
+        // Overall Strategy:
+        // * Under - for portions of the span < xMin, take the color at pixel {xMin, y} and use it
+        //   to fill in the 5 pixel sampled from A to B.
+        // * Middle - for the portion of the span between xMin and xMax sample normally.
+        // * Over - for the portion of the span > xMax, take the color at pixel {xMax-1, y} and
+        //   use it to fill in the rest of the destination pixels.
+        const bool leftToRight = dx >= 0;
+        // Where the span is broken first and second, and the pixels used for its clamped ends.
+        const SkScalar firstBreak  = leftToRight ? SK_ScalarHalf : fXMax;
+        const SkScalar secondBreak = leftToRight ? fXMax : SK_ScalarHalf;
+        const SkScalar firstPixel  = leftToRight ? SK_ScalarHalf : fXMaxPixel;
+        const SkScalar lastPixel   = leftToRight ? fXMaxPixel : SK_ScalarHalf;
+
+        // The piece ahead of the first break is clamped to a single pixel.
+        Span leading = span.breakAt(firstBreak, dx);
+        if (!leading.isEmpty()) {
+            leading.clampToSinglePixel({firstPixel, y});
+            next->pointSpan(leading);
+        }
+
+        // The piece between the two breaks is sampled normally.
+        Span center = span.breakAt(secondBreak, dx);
+        if (!center.isEmpty()) {
+            next->pointSpan(center);
+        }
+
+        // Whatever is left of the span is clamped to the pixel at the other end.
+        if (!span.isEmpty()) {
+            span.clampToSinglePixel({lastPixel, y});
+            next->pointSpan(span);
+        }
+        return true;
+    }
+
+private:
+    const SkScalar fXMaxPixel;
+    const SkScalar fXMax;
+};
+
+class YClampStrategy {  // Clamps y values to the range [0.5, max - 0.5].
+public:
+    YClampStrategy(int32_t max)
+        : fYMaxPixel{SkScalar(max) - SK_ScalarHalf} { }  // Largest y value that is allowed.
+
+    void tileYPoints(Sk4s* ys) {  // Clamps all of *ys in place.
+        *ys = Sk4s::Min(Sk4s::Max(*ys, SK_ScalarHalf), fYMaxPixel);
+        assertTiled(*ys, fYMaxPixel + SK_ScalarHalf);  // fYMaxPixel + 0.5 recovers max.
+    }
+
+    SkScalar tileY(SkScalar y) {  // Scalar form: runs y through tileYPoints, returns lane 0.
+        Sk4f ys{y};
+        tileYPoints(&ys);
+        return ys[0];
+    }
+
+private:
+    const SkScalar fYMaxPixel;
+};
+
+SkScalar tile_mod(SkScalar x, SkScalar base, SkScalar cap) {
+    // Floored x mod base; min() caps the out-of-bounds value base that a tiny negative x yields.
+    const SkScalar wrapped = x - SkScalarFloorToScalar(x / base) * base;
+    return std::min(wrapped, cap);
+}
+
+class XRepeatStrategy {  // Tiles x by wrapping it on to [0, max).
+public:
+    XRepeatStrategy(int32_t max)
+        : fXMax{SkScalar(max)}
+        , fXCap{SkScalar(nextafterf(SkScalar(max), 0.0f))}  // Next float from max toward zero.
+        , fXInvMax{1.0f / SkScalar(max)} { }  // Lets tileXPoints multiply instead of divide.
+
+    void tileXPoints(Sk4s* xs) {  // Vector form of tile_mod, applied in place.
+        Sk4s divX = *xs * fXInvMax;
+        Sk4s modX = *xs - divX.floor() * fXMax;
+        *xs = Sk4s::Min(fXCap, modX);  // Caps results that round up to fXMax itself.
+        assertTiled(*xs, fXMax);
+    }
+
+    template<typename Next>
+    bool maybeProcessSpan(Span originalSpan, Next* next) {
+        SkASSERT(!originalSpan.isEmpty());
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = originalSpan;
+        // Make x in range on the tile; y is passed through unchanged.
+        SkScalar x = tile_mod(X(start), fXMax, fXCap);
+        SkScalar y = Y(start);
+        SkScalar dx = (count > 1) ? length / (count - 1) : 0.0f;  // Never divide by zero.
+
+        // No need trying to go fast because the steps are larger than a tile or there is one point.
+        if (SkScalarAbs(dx) >= fXMax || count <= 1) {
+            return false;
+        }
+
+        //             A        B     C                  D                Z
+        // +-------+-------+-------++-------+-------+-------++     +-------+-------++------
+        // |       |   *---|*---*--||-*---*-|---*---|*---*--||     |--*---*|       ||
+        // |       |       |       ||       |       |       || ... |       |       ||
+        // |       |       |       ||       |       |       ||     |       |       ||
+        // +-------+-------+-------++-------+-------+-------++     +-------+-------++------
+        //                         ^^                       ^^                     ^^
+        //                    xMax || xMin             xMax || xMin           xMax || xMin
+        //
+        //     *---*---*---... - track of samples. * = sample
+        //
+        //     +-+                                 ||
+        //     | |  - pixels in source space.      || - tile border.
+        //     +-+                                 ||
+        //
+        //
+        // The given span starts at A and continues on through several tiles to sample point Z.
+        // The idea is to break this into several spans one on each tile the entire span
+        // intersects. The A to B span only covers a partial tile and has a count of 3 and the
+        // distance from A to B is (count - 1) * dx or 2 * dx. The distance from A to the start of
+        // the next span is count * dx or 3 * dx. Span C to D covers an entire tile has a count
+        // of 5 and a length of 4 * dx. Remember, count is the number of pixels needed for the
+        // destination and the number of samples.
+        //
+        // Overall Strategy:
+        // While the span hangs over the edge of the tile, draw the span covering the tile then
+        // slide the span over to the next tile.
+
+        // The guard could have been count > 0, but then a bunch of math would be done in the
+        // common case.
+
+        Span span({x, y}, length, count);
+        if (dx > 0) {
+            while (!span.isEmpty() && span.endX() >= fXMax) {  // Hangs over the right edge.
+                Span toDraw = span.breakAt(fXMax, dx);
+                next->pointSpan(toDraw);
+                span.offset(-fXMax);  // Slide the rest of the span back by one tile.
+            }
+        } else {
+            while (!span.isEmpty() && span.endX() < 0.0f) {  // Hangs over the left edge.
+                Span toDraw = span.breakAt(0.0f, dx);
+                next->pointSpan(toDraw);
+                span.offset(fXMax);  // Slide the rest of the span forward by one tile.
+            }
+        }
+
+        // All on a single tile.
+        if (!span.isEmpty()) {
+            next->pointSpan(span);
+        }
+
+        return true;
+    }
+
+private:
+    const SkScalar fXMax;
+    const SkScalar fXCap;
+    const SkScalar fXInvMax;
+};
+
+// The XRepeatUnitScaleStrategy exploits the situation where dx = 1.0. The main advantage is that
+// the relationship between the sample points and the source pixels does not change from tile to
+// repeated tile. This allows the tiler to calculate the span once and re-use it for each
+// repeated tile. This is later exploited by some samplers to avoid converting pixels to linear
+// space, allowing the use of memmove to place pixels in the destination.
+class XRepeatUnitScaleStrategy {
+public:
+    XRepeatUnitScaleStrategy(int32_t max)
+        : fXMax{SkScalar(max)}
+        , fXCap{SkScalar(nextafterf(SkScalar(max), 0.0f))}
+        , fXInvMax{1.0f / SkScalar(max)} { }
+
+    void tileXPoints(Sk4s* xs) {
+        Sk4s divX = *xs * fXInvMax;              // x / max
+        Sk4s modX = *xs - divX.floor() * fXMax;  // x less the whole tiles below it
+        *xs = Sk4s::Min(fXCap, modX);            // clamp to fXCap
+        assertTiled(*xs, fXMax);
+    }
+
+    // Splits the span into leading, repeated and trailing pieces; false means use point lists.
+    template<typename Next>
+    bool maybeProcessSpan(Span originalSpan, Next* next) {
+        SkASSERT(!originalSpan.isEmpty());
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = originalSpan;
+        // Bring x into range on the tile; y is used as given.
+        SkScalar x = tile_mod(X(start), fXMax, fXCap);
+        SkScalar y = Y(start);
+
+        // Not worth a fast path: the tile is one unit wide or there is at most one point.
+        if (fXMax == 1 || count <= 1) {
+            return false;
+        }
+
+        SkASSERT(0.0f <= x && x < fXMax);  // x should be on the tile.
+        Span span({x, y}, length, count);
+
+        // Not starting in the first pixel: draw up to the tile edge, then slide the rest back.
+        if (SkScalarFloorToScalar(x) != 0.0f) {
+            Span toDraw = span.breakAt(fXMax, 1.0f);
+            SkASSERT(0.0f <= toDraw.startX() && toDraw.endX() < fXMax);
+            next->pointSpan(toDraw);
+            span.offset(-fXMax);
+        }
+
+        // All of the span could have been on the first tile. If so, then no work to do.
+        if (span.isEmpty()) return true;
+
+        // At this point the span should be aligned to zero.
+        SkASSERT(SkScalarFloorToScalar(span.startX()) == 0.0f);
+
+        // Note: The tile width is a half open interval [tb, te), but the span is a closed
+        // interval [sb, se]. To compare the two, convert the span to a half open interval by
+        // adding dx to se, so the span becomes [sb, se + dx). Hence the + 1.0f below.
+        SkScalar div = (span.length() + 1.0f) / fXMax;
+        int32_t repeatCount = SkScalarFloorToInt(div);  // Whole tiles the span covers.
+        Span repeatableSpan{{0.0f, y}, fXMax - 1.0f, SkScalarFloorToInt(fXMax)};
+
+        // Repeat the center section.
+        SkASSERT(0.0f <= repeatableSpan.startX() && repeatableSpan.endX() < fXMax);
+        if (repeatCount > 0) {
+            next->repeatSpan(repeatableSpan, repeatCount);
+        }
+
+        // Calculate the advance past the center portion.
+        SkScalar advance = SkScalar(repeatCount) * fXMax;
+
+        // Drop the repeated portion (breakAt's result is unused); any leftover stays in span.
+        span.breakAt(advance, 1.0f);
+
+        // The leftover fits on one tile: slide it back onto the tile and emit it.
+        if (!span.isEmpty()) {
+            span.offset(-advance);
+            SkASSERT(0.0f <= span.startX() && span.endX() < fXMax);
+            next->pointSpan(span);
+        }
+
+        return true;
+    }
+
+private:
+    const SkScalar fXMax;     // Tile extent in x.
+    const SkScalar fXCap;     // Nearest float to fXMax in the direction of zero.
+    const SkScalar fXInvMax;  // 1 / fXMax.
+};
+
+class YRepeatStrategy {
+public:
+    YRepeatStrategy(int32_t max) : fYMax{SkScalar(max)}
+        , fYCap{SkScalar(nextafterf(SkScalar(max), 0.0f))}
+        , fYsInvMax{1.0f / SkScalar(max)} { }
+
+    // Tiles four ys in place: subtract the whole tiles below each y, then clamp to fYCap.
+    void tileYPoints(Sk4s* ys) {
+        const Sk4s wholeTiles = (*ys * fYsInvMax).floor();
+        *ys = Sk4s::Min(fYCap, *ys - wholeTiles * fYMax);
+        assertTiled(*ys, fYMax);
+    }
+
+    // Scalar counterpart; tile_mod does the work and the result must lie in [0, fYMax).
+    SkScalar tileY(SkScalar y) {
+        const SkScalar tiled = tile_mod(y, fYMax, fYCap);
+        SkASSERT(0 <= tiled && tiled < fYMax);
+        return tiled;
+    }
+
+private:
+    const SkScalar fYMax;      // Tile extent in y.
+    const SkScalar fYCap;      // Nearest float to fYMax in the direction of zero.
+    const SkScalar fYsInvMax;  // 1 / fYMax.
+};
+// Mirror tiling in Mathematica-style notation, shown for max = 40:
+// mq2[x_] := Abs[(x - 40) - Floor[(x - 40)/80] * 80 - 40]
+class XMirrorStrategy {
+public:
+    XMirrorStrategy(int32_t max) : fXMax{SkScalar(max)}
+        , fXCap{SkScalar(nextafterf(SkScalar(max), 0.0f))}
+        , fXDoubleInvMax{1.0f / (2.0f * SkScalar(max))} { }
+
+    // Mirrors four xs in place: |((x - max) mod (2 * max)) - max|, clamped to fXCap.
+    void tileXPoints(Sk4s* xs) {
+        const Sk4f shifted  = *xs - fXMax;
+        const Sk4f periods  = (shifted * fXDoubleInvMax).floor();
+        const Sk4f centered = (shifted - periods * 2.0f * fXMax) - fXMax;
+        *xs = Sk4f::Min(centered.abs(), fXCap);
+        assertTiled(*xs, fXMax);
+    }
+
+    // There is no span fast path for mirroring; false sends the caller to point lists.
+    template <typename Next>
+    bool maybeProcessSpan(Span originalSpan, Next* next) { return false; }
+
+private:
+    SkScalar fXMax;           // Tile extent in x.
+    SkScalar fXCap;           // Nearest float to fXMax in the direction of zero.
+    SkScalar fXDoubleInvMax;  // 1 / (2 * fXMax).
+};
+
+class YMirrorStrategy {
+public:
+    YMirrorStrategy(int32_t max) : fYMax{SkScalar(max)}
+        , fYCap{nextafterf(SkScalar(max), 0.0f)}
+        , fYDoubleInvMax{1.0f / (2.0f * SkScalar(max))} { }
+
+    // Mirrors four ys in place: |((y - max) mod (2 * max)) - max|, clamped to fYCap.
+    void tileYPoints(Sk4s* ys) {
+        const Sk4f shifted  = *ys - fYMax;
+        const Sk4f periods  = (shifted * fYDoubleInvMax).floor();
+        const Sk4f centered = (shifted - periods * 2.0f * fYMax) - fYMax;
+        *ys = Sk4f::Min(centered.abs(), fYCap);
+        assertTiled(*ys, fYMax);
+    }
+
+    // Scalar counterpart of tileYPoints; the result must lie in [0, fYMax).
+    SkScalar tileY(SkScalar y) {
+        const SkScalar shifted  = y - fYMax;
+        const SkScalar scaled   = shifted * fYDoubleInvMax;
+        const SkScalar wrapped  = shifted - SkScalarFloorToScalar(scaled) * 2.0f * fYMax;
+        const SkScalar centered = wrapped - fYMax;
+        const SkScalar mirrored = SkMinScalar(SkScalarAbs(centered), fYCap);
+        SkASSERT(0 <= mirrored && mirrored < fYMax);
+        return mirrored;
+    }
+
+private:
+    SkScalar fYMax;           // Tile extent in y.
+    SkScalar fYCap;           // Nearest float to fYMax in the direction of zero.
+    SkScalar fYDoubleInvMax;  // 1 / (2 * fYMax).
+};
+
+}  // namespace
+#endif  // SkLinearBitmapPipeline_tile_DEFINED
diff --git a/src/shaders/SkBitmapProcShader.cpp b/src/shaders/SkBitmapProcShader.cpp
index 1a87491bf4..91697e2f1b 100644
--- a/src/shaders/SkBitmapProcShader.cpp
+++ b/src/shaders/SkBitmapProcShader.cpp
@@ -100,6 +100,79 @@ private:
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "SkLinearBitmapPipeline.h"
+#include "SkPM4f.h"
+
+// Shader context that shades through an SkLinearBitmapPipeline. shadeSpan4f forwards to the
+// pipeline directly; shadeSpan converts the pipeline's SkPM4f output to SkPMColor.
+class LinearPipelineContext : public BitmapProcInfoContext {
+public:
+    LinearPipelineContext(const SkShaderBase& shader, const SkShaderBase::ContextRec& rec,
+                          SkBitmapProcInfo* info, SkArenaAlloc* alloc)
+        : INHERITED(shader, rec, info), fAllocator{alloc}
+    {
+        // Keep copies of what a blitter pipeline would need, in case one is built later.
+        fSrcPixmap      = info->fPixmap;
+        fFilterQuality  = info->fFilterQuality;
+        fMatrixTypeMask = info->fRealInvMatrix.getType();
+        fAlpha          = SkColorGetA(info->fPaintColor) / 255.0f;
+
+        // The pipeline is allocated from the same arena that was handed to this context.
+        fShaderPipeline = alloc->make<SkLinearBitmapPipeline>(
+            info->fRealInvMatrix, info->fFilterQuality, info->fTileModeX, info->fTileModeY,
+            info->fPaintColor, info->fPixmap, fAllocator);
+    }
+
+    // Float output needs no conversion, so hand the request straight to the pipeline.
+    void shadeSpan4f(int x, int y, SkPM4f dstC[], int count) override {
+        fShaderPipeline->shadeSpan4f(x, y, dstC, count);
+    }
+
+    // Shades in chunks of at most kChunk pixels through a stack buffer of SkPM4f, converting
+    // each chunk to SkPMColor as it is produced.
+    void shadeSpan(int x, int y, SkPMColor dstC[], int count) override {
+        constexpr int kChunk = 128;
+        SkPM4f scratch[kChunk];
+
+        for (int done = 0; done < count; ) {
+            const int n = SkTMin(count - done, kChunk);
+            fShaderPipeline->shadeSpan4f(x + done, y, scratch, n);
+            for (int i = 0; i < n; ++i) {
+                dstC[done + i] = Sk4f_toL32(scratch[i].to4f_pmorder());
+            }
+            done += n;
+        }
+    }
+
+private:
+    // The allocator from context creation, kept in case we are asked to build a blitter.
+    SkArenaAlloc*           fAllocator;
+    SkLinearBitmapPipeline* fShaderPipeline;
+    SkPixmap                fSrcPixmap;
+    float                   fAlpha;
+    SkMatrix::TypeMask      fMatrixTypeMask;
+    SkFilterQuality         fFilterQuality;
+
+    typedef BitmapProcInfoContext INHERITED;
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+static bool choose_linear_pipeline(const SkShaderBase::ContextRec& rec, const SkImageInfo& srcInfo) {
+    // Either context would do here. Pick the linear pipeline when the caller prefers PM4f
+    // output, or when the source is unpremultiplied or is 4 bytes per pixel but not kN32.
+    const bool wantsPM4f = SkShaderBase::ContextRec::kPM4f_DstType == rec.fPreferredDstType;
+    const bool unpremul  = kUnpremul_SkAlphaType == srcInfo.alphaType();
+    const bool swizzled  = 4 == srcInfo.bytesPerPixel() && kN32_SkColorType != srcInfo.colorType();
+    return wantsPM4f || unpremul || swizzled;
+}
+
+size_t SkBitmapProcLegacyShader::ContextSize(const ContextRec& rec, const SkImageInfo& srcInfo) {
+    // Size of the larger context + state pair; rec and srcInfo are not consulted.
+    const size_t legacyBytes = sizeof(BitmapProcShaderContext) + sizeof(SkBitmapProcState);
+    const size_t linearBytes = sizeof(LinearPipelineContext) + sizeof(SkBitmapProcInfo);
+    return SkTMax(legacyBytes, linearBytes);
+}
 
 SkShaderBase::Context* SkBitmapProcLegacyShader::MakeContext(
     const SkShaderBase& shader, TileMode tmx, TileMode tmy,
@@ -111,10 +184,21 @@ SkShaderBase::Context* SkBitmapProcLegacyShader::MakeContext(
         return nullptr;
     }
 
-    SkBitmapProcState* state = alloc->make<SkBitmapProcState>(provider, tmx, tmy);
-    if (!state->setup(totalInverse, *rec.fPaint)) {
-        return nullptr;
-    }
-    return alloc->make<BitmapProcShaderContext>(shader, rec, state);
+    // Decide if we can/want to use the new linear pipeline
+    bool useLinearPipeline = choose_linear_pipeline(rec, provider.info());
 
+    if (useLinearPipeline) {
+        SkBitmapProcInfo* info = alloc->make<SkBitmapProcInfo>(provider, tmx, tmy);
+        if (!info->init(totalInverse, *rec.fPaint)) {
+            return nullptr;
+        }
+
+        return alloc->make<LinearPipelineContext>(shader, rec, info, alloc);
+    } else {
+        SkBitmapProcState* state = alloc->make<SkBitmapProcState>(provider, tmx, tmy);
+        if (!state->setup(totalInverse, *rec.fPaint)) {
+            return nullptr;
+        }
+        return alloc->make<BitmapProcShaderContext>(shader, rec, state);
+    }
 }
diff --git a/src/shaders/SkBitmapProcShader.h b/src/shaders/SkBitmapProcShader.h
index 7c5cdcfb8d..2a2599cb1d 100644
--- a/src/shaders/SkBitmapProcShader.h
+++ b/src/shaders/SkBitmapProcShader.h
@@ -16,6 +16,7 @@ class SkBitmapProcLegacyShader : public SkShaderBase {
 private:
     friend class SkImageShader;
 
+    static size_t ContextSize(const ContextRec&, const SkImageInfo& srcInfo);
     static Context* MakeContext(const SkShaderBase&, TileMode tmx, TileMode tmy,
                                 const SkBitmapProvider&, const ContextRec&, SkArenaAlloc* alloc);