aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--src/core/SkMaskBlurFilter.cpp369
-rw-r--r--src/opts/SkNx_neon.h5
-rw-r--r--src/opts/SkNx_sse.h29
3 files changed, 285 insertions, 118 deletions
diff --git a/src/core/SkMaskBlurFilter.cpp b/src/core/SkMaskBlurFilter.cpp
index 294b4a6322..0ab85000a4 100644
--- a/src/core/SkMaskBlurFilter.cpp
+++ b/src/core/SkMaskBlurFilter.cpp
@@ -11,6 +11,7 @@
#include <climits>
#include "SkArenaAlloc.h"
+#include "SkNx.h"
#include "SkSafeMath.h"
static const double kPi = 3.14159265358979323846264338327950288;
@@ -25,7 +26,13 @@ class BlurScanInterface {
public:
virtual ~BlurScanInterface() = default;
virtual void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
- uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const = 0;
+ uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const = 0;
+ virtual bool canBlur4() { return false; } // Default: this scanner has no four-rows-at-a-time path; implementations that provide blur4Transpose override this to return true.
+ virtual void blur4Transpose( // Default four-row transposing blur: not implemented here, so it aborts. Same parameter list as blur().
+ const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
+ uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const {
+ SK_ABORT("This should not be called."); // Reaching this means a caller did not check canBlur4() first.
+ }
};
class PlanningInterface {
@@ -49,120 +56,146 @@ public:
SK_ABORT("Should never be called.");
return nullptr;
}
-private:
};
-// This class is deprecated, and will be replaced by Box.
-class PlanBoxInteger final : public PlanningInterface {
+// Old slower version of Box which uses 64 bit multiply instead of 32 bit multiply.
+// Controlled by SK_SUPPORT_LEGACY_SLOW_SMALL_BLUR
+class PlanBox32 final : public PlanningInterface {
public:
- explicit PlanBoxInteger(double sigma) {
-
+ explicit PlanBox32(double sigma) {
// Calculate the radius from sigma. Taken from the old code until something better is
// figured out.
auto possibleRadius = 1.5 * sigma - 0.5;
- auto radius = std::max(1.0, std::ceil(possibleRadius));
- auto window = 2 * radius + 1;
+ auto radius = std::max(std::numeric_limits<double>::epsilon(), possibleRadius);
+ auto outerRadius = std::ceil(radius);
+ auto outerWindow = 2 * outerRadius + 1;
+ auto outerFactor = (1 - (outerRadius - radius)) / outerWindow;
+ fOuterWeight = static_cast<uint64_t>(round(outerFactor * (1ull << 32)));
+
+ auto innerRadius = outerRadius - 1;
+ auto innerWindow = 2 * innerRadius + 1;
+ auto innerFactor = (1 - (radius - innerRadius)) / innerWindow;
+ fInnerWeight = static_cast<uint64_t>(round(innerFactor * (1ull << 32)));
- fWindow = static_cast<size_t>(window);
+ // Sliding window is defined by the relationship between the outer and inner windows.
+ // In the single window case, you add the element on the right, and subtract the element on
+ // the left. But, because two windows are used, this relationship is more complicated; an
+ // element is added from the right of the outer window, and subtracted from the left of the
+ // inner window. Because innerWindow = outerWindow - 2, the distance between
+ // the left and right in the two window case is outerWindow - 1.
+ fSlidingWindow = static_cast<size_t>(outerWindow - 1);
}
- size_t bufferSize() const override { return fWindow - 1; }
+ size_t bufferSize() const override { return 0; }
- size_t border() const override { return (fWindow - 1) / 2; }
+ // Remember that sliding window = window - 1. Therefore, radius = sliding window / 2.
+ size_t border() const override { return fSlidingWindow / 2; }
bool needsBlur() const override { return true; }
BlurScanInterface* makeBlurScan(
SkArenaAlloc* alloc, size_t width, uint32_t* buffer) const override
{
- auto weight = static_cast<uint64_t>(round(1.0 / fWindow * (1ull << 32)));
- auto bufferEnd = buffer + this->bufferSize();
- size_t noChangeCount = fWindow > width ? fWindow - width : 0;
+ size_t noChangeCount;
+ size_t trailingEdgeZeroCount;
+
+ // The relation between the slidingWindow and the width dictates two operating modes.
+ // * width >= slidingWindow - both sides of the window are contained in the image while
+ // scanning. Therefore, we assume that slidingWindow zeros are consumed on the trailing
+ // edge of the window. After this count, then both edges are traversing the image.
+ // * slidingWindow > width - both sides of the window are off the image while scanning
+ // the middle. The front edge of the window can only travel width until it falls off the
+ // image. At this point, both edges of the window are off the image consuming zeros
+ // and therefore, the destination value does not change. The scan produces unchanged
+ // values until the trailing edge of the window enters the image. This count is
+ // slidingWindow - width.
+ if (width >= fSlidingWindow) {
+ noChangeCount = 0;
+ trailingEdgeZeroCount = fSlidingWindow;
+ } else {
+ noChangeCount = fSlidingWindow - width;
+ trailingEdgeZeroCount = width;
+ }
+ return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount);
- return alloc->make<BoxInteger>(weight, buffer, bufferEnd, noChangeCount);
}
private:
- class BoxInteger final : public BlurScanInterface {
+ class Box final : public BlurScanInterface {
public:
- BoxInteger(uint64_t weight, uint32_t* buffer, uint32_t* bufferEnd, size_t noChangeCount)
- : fWeight{weight}
- , fBuffer0{buffer}
- , fBuffer0End{bufferEnd}
- , fNoChangeCount{noChangeCount} { }
+ Box(uint64_t outerWeight, uint64_t innerWeight,
+ size_t noChangeCount, size_t trailingEdgeZeroCount)
+ : fOuterWeight{outerWeight}
+ , fInnerWeight{innerWeight}
+ , fNoChangeCount{noChangeCount}
+ , fTrailingEdgeZeroCount{trailingEdgeZeroCount} { }
void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override {
- auto buffer0Cursor = fBuffer0;
- std::memset(&fBuffer0[0], 0, (fBuffer0End - fBuffer0) * sizeof(*fBuffer0));
- uint32_t sum0 = 0;
-
- // Consume the source generating pixels.
- for (auto srcCursor = src;
- srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) {
- uint32_t s = *srcCursor;
- sum0 += s;
+ auto rightOuter = src;
+ auto dstCursor = dst;
- *dst = this->finalScale(sum0);
+ uint32_t outerSum = 0;
+ uint32_t innerSum = 0;
+ for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
+ innerSum = outerSum;
+ outerSum += *rightOuter;
+ *dstCursor = this->interpolateSums(outerSum, innerSum);
- sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
- buffer0Cursor =
- (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0];
+ rightOuter += srcStride;
+ dstCursor += dstStride;
}
- // This handles the case when both ends of the box are not between [src, srcEnd), and
- // both are zero at that point.
+ // slidingWindow > width
for (size_t i = 0; i < fNoChangeCount; i++) {
- uint32_t s = 0;
- sum0 += s;
-
- *dst = this->finalScale(sum0);
-
- sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
- buffer0Cursor =
- (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0];
- dst += dstStride;
+ *dstCursor = this->interpolateSums(outerSum, innerSum);;
+ dstCursor += dstStride;
}
- // Starting from the right, fill in the rest of the buffer.
- std::memset(&fBuffer0[0], 0, (fBuffer0End - &fBuffer0[0]) * sizeof(fBuffer0[0]));
+ // width > slidingWindow
+ auto leftInner = src;
+ while (rightOuter < srcEnd) {
+ innerSum = outerSum - *leftInner;
+ outerSum += *rightOuter;
+ *dstCursor = this->interpolateSums(outerSum, innerSum);
+ outerSum -= *leftInner;
- sum0 = 0;
+ rightOuter += srcStride;
+ leftInner += srcStride;
+ dstCursor += dstStride;
+ }
- uint8_t* dstCursor = dstEnd;
- const uint8_t* srcCursor = srcEnd;
- do {
+ auto leftOuter = srcEnd;
+ dstCursor = dstEnd;
+ outerSum = 0;
+ for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
+ leftOuter -= srcStride;
dstCursor -= dstStride;
- srcCursor -= srcStride;
- uint32_t s = *srcCursor;
- sum0 += s;
- *dstCursor = this->finalScale(sum0);
-
- sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
- buffer0Cursor =
- (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0];
- } while (dstCursor > dst);
+ innerSum = outerSum;
+ outerSum += *leftOuter;
+ *dstCursor = this->interpolateSums(outerSum, innerSum);
+ }
}
+ private:
static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
- uint8_t finalScale(uint32_t sum) const {
- return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
+ uint8_t interpolateSums(uint32_t outerSum, uint32_t innerSum) const {
+ return SkTo<uint8_t>((fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 32);
}
-
- const uint64_t fWeight;
- uint32_t* const fBuffer0;
- uint32_t* const fBuffer0End;
- const size_t fNoChangeCount;
+ uint64_t fOuterWeight;
+ uint64_t fInnerWeight;
+ size_t fNoChangeCount;
+ size_t fTrailingEdgeZeroCount;
};
-
- size_t fWindow;
+private:
+ uint64_t fOuterWeight;
+ uint64_t fInnerWeight;
+ size_t fSlidingWindow;
};
+
class PlanBox final : public PlanningInterface {
public:
explicit PlanBox(double sigma) {
@@ -173,12 +206,12 @@ public:
auto outerRadius = std::ceil(radius);
auto outerWindow = 2 * outerRadius + 1;
auto outerFactor = (1 - (outerRadius - radius)) / outerWindow;
- fOuterWeight = static_cast<uint64_t>(round(outerFactor * (1ull << 32)));
+ fOuterWeight = static_cast<uint32_t>(round(outerFactor * (1ull << 24)));
auto innerRadius = outerRadius - 1;
auto innerWindow = 2 * innerRadius + 1;
auto innerFactor = (1 - (radius - innerRadius)) / innerWindow;
- fInnerWeight = static_cast<uint64_t>(round(innerFactor * (1ull << 32)));
+ fInnerWeight = static_cast<uint32_t>(round(innerFactor * (1ull << 24)));
// Sliding window is defined by the relationship between the outer and inner widows.
// In the single window case, you add the element on the right, and subtract the element on
@@ -189,7 +222,9 @@ public:
fSlidingWindow = static_cast<size_t>(outerWindow - 1);
}
- size_t bufferSize() const override { return 0; }
+ size_t bufferSize() const override {
+ return fSlidingWindow * (sizeof(Sk4u) / sizeof(uint32_t));
+ }
// Remember that sliding window = window - 1. Therefore, radius = sliding window / 2.
size_t border() const override { return fSlidingWindow / 2; }
@@ -219,31 +254,41 @@ public:
noChangeCount = fSlidingWindow - width;
trailingEdgeZeroCount = width;
}
- return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount);
+ Sk4u* sk4uBuffer = reinterpret_cast<Sk4u*>(buffer);
+ return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount,
+ sk4uBuffer, sk4uBuffer + fSlidingWindow);
}
private:
class Box final : public BlurScanInterface {
public:
- Box(uint64_t outerWeight, uint64_t innerWeight,
- size_t noChangeCount, size_t trailingEdgeZeroCount)
+ Box(uint32_t outerWeight, uint32_t innerWeight,
+ size_t noChangeCount, size_t trailingEdgeZeroCount,
+ Sk4u* buffer, Sk4u* bufferEnd)
: fOuterWeight{outerWeight}
, fInnerWeight{innerWeight}
, fNoChangeCount{noChangeCount}
- , fTrailingEdgeZeroCount{trailingEdgeZeroCount} { }
+ , fTrailingEdgeZeroCount{trailingEdgeZeroCount}
+ , fBuffer{buffer}
+ , fBufferEnd{bufferEnd} { }
void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
- uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override {
+ uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override {
auto rightOuter = src;
auto dstCursor = dst;
+ auto interpolateSums = [this](uint32_t outerSum, uint32_t innerSum) {
+ return SkTo<uint8_t>(
+ (fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 24);
+ };
+
uint32_t outerSum = 0;
uint32_t innerSum = 0;
for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
innerSum = outerSum;
outerSum += *rightOuter;
- *dstCursor = this->interpolateSums(outerSum, innerSum);
+ *dstCursor = interpolateSums(outerSum, innerSum);
rightOuter += srcStride;
dstCursor += dstStride;
@@ -251,7 +296,7 @@ private:
// slidingWindow > width
for (size_t i = 0; i < fNoChangeCount; i++) {
- *dstCursor = this->interpolateSums(outerSum, innerSum);;
+ *dstCursor = interpolateSums(outerSum, innerSum);;
dstCursor += dstStride;
}
@@ -260,7 +305,7 @@ private:
while (rightOuter < srcEnd) {
innerSum = outerSum - *leftInner;
outerSum += *rightOuter;
- *dstCursor = this->interpolateSums(outerSum, innerSum);
+ *dstCursor = interpolateSums(outerSum, innerSum);
outerSum -= *leftInner;
rightOuter += srcStride;
@@ -277,24 +322,108 @@ private:
innerSum = outerSum;
outerSum += *leftOuter;
- *dstCursor = this->interpolateSums(outerSum, innerSum);
+ *dstCursor = interpolateSums(outerSum, innerSum);
}
}
- private:
- static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
+ bool canBlur4() override { return true; } // This scanner implements blur4Transpose, so callers may take the four-rows-at-a-time path.
- uint8_t interpolateSums(uint32_t outerSum, uint32_t innerSum) const {
- return SkTo<uint8_t>((fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 32);
+ // NB this is a transposing scan. The next src is src+1, and the next down is
+ // src+srcStride.
+ void blur4Transpose( // Box-blurs four source rows in lock step (one row per Sk4u lane) and stores four result bytes per step, advancing dst by dstStride.
+ const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
+ uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override {
+ auto rightOuter = src; // Leading (right) edge of the outer window; advances by 1 per step.
+ auto dstCursor = dst;
+
+ Sk4u* const bufferStart = fBuffer; // [fBuffer, fBufferEnd) is used as a ring of previously loaded leading-edge vectors.
+ Sk4u* bufferCursor = bufferStart;
+ Sk4u* const bufferEnd = fBufferEnd;
+
+ const Sk4u outerWeight(SkTo<uint32_t>(fOuterWeight)); // Broadcast the scalar weights to all four lanes.
+ const Sk4u innerWeight(SkTo<uint32_t>(fInnerWeight));
+
+ auto load = [](const uint8_t* cursor, size_t stride) -> Sk4u { // Gathers one byte from each of four positions `stride` bytes apart.
+ return Sk4u(cursor[0*stride], cursor[1*stride], cursor[2*stride], cursor[3*stride]);
+ };
+
+ auto interpolateSums = [&] (const Sk4u& outerSum, const Sk4u& innerSum) { // Weighted blend of the two window sums, rounded (kHalf) and shifted down by 24 bits, narrowed to bytes.
+ return
+ SkNx_cast<uint8_t>(
+ (outerSum * outerWeight + innerSum * innerWeight + kHalf) >> 24);
+ };
+
+ Sk4u outerSum = 0;
+ Sk4u innerSum = 0;
+ for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) { // Ramp-up: values are only added; nothing is subtracted yet.
+ innerSum = outerSum; // Inner sum is the outer sum before the new leading value is added.
+
+ Sk4u leadingEdge = load(rightOuter, srcStride);
+ outerSum += leadingEdge;
+ Sk4b blurred = interpolateSums(outerSum, innerSum);
+ blurred.store(dstCursor);
+
+ leadingEdge.store(bufferCursor); // Remember this value so it can be subtracted later as the trailing edge.
+ bufferCursor = (bufferCursor + 1) < bufferEnd ? bufferCursor + 1 : bufferStart; // Advance with wrap-around.
+
+ rightOuter += 1;
+ dstCursor += dstStride;
+ }
+
+ // slidingWindow > width: the sums do not change here, so the same blended value is stored fNoChangeCount times.
+ for (size_t i = 0; i < fNoChangeCount; i++) {
+ Sk4b blurred = interpolateSums(outerSum, innerSum);
+ blurred.store(dstCursor);
+ dstCursor += dstStride;
+ }
+
+ // width > slidingWindow: steady state, one value enters and the buffered trailing value leaves per step.
+ auto leftInner = src; // NOTE(review): assigned and incremented below but never read in this function; the trailing value comes from the ring buffer instead.
+ while (rightOuter < srcEnd) {
+ Sk4u trailEdge = Sk4u::Load(bufferCursor); // Oldest buffered value, about to be overwritten below.
+ Sk4u leadingEdge = load(rightOuter, srcStride);
+ innerSum = outerSum - trailEdge;
+ outerSum += leadingEdge;
+
+ Sk4b blurred = interpolateSums(outerSum, innerSum);
+ blurred.store(dstCursor);
+
+ outerSum -= trailEdge;
+ leadingEdge.store(bufferCursor);
+ bufferCursor = (bufferCursor + 1) < bufferEnd ? bufferCursor + 1 : bufferStart;
+
+ rightOuter += 1;
+ leftInner += 1;
+ dstCursor += dstStride;
+ }
+
+ auto leftOuter = srcEnd; // Tail: restart with a zero sum and walk backwards from srcEnd/dstEnd.
+ dstCursor = dstEnd;
+ outerSum = 0;
+ for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
+ leftOuter -= 1;
+ dstCursor -= dstStride;
+
+ innerSum = outerSum;
+ outerSum += load(leftOuter, srcStride);
+ Sk4b blurred = interpolateSums(outerSum, innerSum);
+ blurred.store(dstCursor);
+ }
+ }
- uint64_t fOuterWeight;
- uint64_t fInnerWeight;
- size_t fNoChangeCount;
- size_t fTrailingEdgeZeroCount;
+
+ private:
+ static constexpr uint32_t kHalf = static_cast<uint32_t>(1) << 23;
+
+ const uint32_t fOuterWeight;
+ const uint32_t fInnerWeight;
+ const size_t fNoChangeCount;
+ const size_t fTrailingEdgeZeroCount;
+ Sk4u* const fBuffer;
+ Sk4u* const fBufferEnd;
};
private:
- uint64_t fOuterWeight;
- uint64_t fInnerWeight;
+ uint32_t fOuterWeight;
+ uint32_t fInnerWeight;
size_t fSlidingWindow;
};
@@ -421,8 +550,8 @@ public:
// Consume the source generating pixels.
for (auto srcCursor = src;
srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) {
- uint32_t s = *srcCursor;
- sum0 += s;
+ uint32_t leadingEdge = *srcCursor;
+ sum0 += leadingEdge;
sum1 += sum0;
sum2 += sum1;
@@ -437,13 +566,14 @@ public:
buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
+ *buffer0Cursor = leadingEdge;
buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
}
+ // The leading edge is off the right side of the mask.
for (size_t i = 0; i < fNoChangeCount; i++) {
- uint32_t s = 0;
- sum0 += s;
+ uint32_t leadingEdge = 0;
+ sum0 += leadingEdge;
sum1 += sum0;
sum2 += sum1;
@@ -458,7 +588,7 @@ public:
buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
+ *buffer0Cursor = leadingEdge;
buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
dst += dstStride;
@@ -474,8 +604,8 @@ public:
while (dstCursor > dst) {
dstCursor -= dstStride;
srcCursor -= srcStride;
- uint32_t s = *srcCursor;
- sum0 += s;
+ uint32_t leadingEdge = *srcCursor;
+ sum0 += leadingEdge;
sum1 += sum0;
sum2 += sum1;
@@ -490,7 +620,7 @@ public:
buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
+ *buffer0Cursor = leadingEdge;
buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
}
}
@@ -526,8 +656,8 @@ static PlanningInterface* make_plan(SkArenaAlloc* alloc, double sigma) {
if (3 * sigma <= 1) {
plan = alloc->make<None>();
} else if (sigma < kSmallSigma) {
- #if defined(SK_LEGACY_SUPPORT_INTEGER_SMALL_RADII)
- plan = alloc->make<PlanBoxInteger>(sigma);
+ #if defined(SK_SUPPORT_LEGACY_SLOW_SMALL_BLUR)
+ plan = alloc->make<PlanBox32>(sigma);
#else
plan = alloc->make<PlanBox>(sigma);
#endif
@@ -598,21 +728,44 @@ SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {
// Blur both directions.
size_t tmpW = srcH;
size_t tmpH = dstW;
+
auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);
// Blur horizontally, and transpose.
auto scanW = planW->makeBlurScan(&alloc, srcW, buffer);
- for (size_t y = 0; y < srcH; y++) {
+ size_t y = 0;
+ if (scanW->canBlur4() && srcH > 4) {
+ for (;y + 4 <= srcH; y += 4) {
+ auto srcStart = &src.fImage[y * src.fRowBytes];
+ auto tmpStart = &tmp[y];
+ scanW->blur4Transpose(srcStart, src.fRowBytes, srcStart + srcW,
+ tmpStart, tmpW, tmpStart + tmpW * tmpH);
+ }
+ }
+
+ for (;y < srcH; y++) {
auto srcStart = &src.fImage[y * src.fRowBytes];
auto tmpStart = &tmp[y];
- scanW->blur(srcStart, 1, srcStart + srcW,
+ scanW->blur(srcStart, 1, srcStart + srcW,
tmpStart, tmpW, tmpStart + tmpW * tmpH);
}
+
// Blur vertically (scan in memory order because of the transposition),
// and transpose back to the original orientation.
auto scanH = planH->makeBlurScan(&alloc, tmpW, buffer);
- for (size_t y = 0; y < tmpH; y++) {
+ y = 0;
+ if (scanH->canBlur4() && tmpH > 4) {
+ for (;y + 4 <= tmpH; y += 4) {
+ auto tmpStart = &tmp[y * tmpW];
+ auto dstStart = &dst->fImage[y];
+
+ scanH->blur4Transpose(
+ tmpStart, tmpW, tmpStart + tmpW,
+ dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
+ }
+ }
+ for (;y < tmpH; y++) {
auto tmpStart = &tmp[y * tmpW];
auto dstStart = &dst->fImage[y];
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 4d7aefc93f..b906a02c82 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -564,6 +564,11 @@ template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
return vqmovn_u16(vcombine_u16(_16, _16));
}
+template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) { // NEON: converts four uint32 lanes to four uint8 lanes with unsigned saturation.
+ uint16x4_t _16 = vqmovn_u32(src.fVec); // Saturating narrow 4 x u32 -> 4 x u16.
+ return vqmovn_u16(vcombine_u16(_16, _16)); // Duplicate to fill a 128-bit vector, then saturating narrow u16 -> u8.
+}
+
template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
return vreinterpretq_s32_u32(vmovl_u16(src.fVec));
}
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 415d6e1205..469aefb145 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -165,6 +165,17 @@ public:
__m128 fVec;
};
+AI static __m128i mullo32(__m128i a, __m128i b) { // Lane-wise multiply of four 32-bit integers, keeping the low 32 bits of each product.
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+ return _mm_mullo_epi32(a, b); // Single instruction when SSE4.1 is available.
+#else
+ __m128i mul20 = _mm_mul_epu32(a, b), // 64-bit products of lanes 0 and 2.
+ mul31 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); // Shift by 4 bytes so lanes 1 and 3 sit in positions 0 and 2, then multiply.
+ return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), // Gather the low 32 bits of each product, then interleave back into lane order 0,1,2,3.
+ _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
+#endif
+}
+
template <>
class SkNx<4, int32_t> {
public:
@@ -179,15 +190,10 @@ public:
AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
- AI SkNx operator * (const SkNx& o) const {
- __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
- mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
- return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
- _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
- }
+ AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); }
AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
- AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
+ AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }
AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
@@ -254,10 +260,10 @@ public:
AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
- // Not quite sure how to best do operator * in SSE2. We probably don't use it.
+ AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); }
AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
- AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
+ AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }
AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
@@ -465,7 +471,6 @@ public:
AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
: fVec(_mm_setr_epi8(a,b,c,d, 0,0,0,0, 0,0,0,0, 0,0,0,0)) {}
-
AI static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
AI void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }
@@ -612,6 +617,10 @@ template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
}
+template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) { // SSE: narrows four uint32 lanes to four bytes; same instruction sequence as the int32_t specialization just above.
+ return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); // Applies the 16->8 pack twice. NOTE(review): the pack reads signed 16-bit halves, so this presumably relies on each lane already fitting in a byte; confirm for new callers.
+}
+
template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) {
return src.fVec;
}