diff options
-rw-r--r-- | src/core/SkMaskBlurFilter.cpp | 369 | ||||
-rw-r--r-- | src/opts/SkNx_neon.h | 5 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 29 |
3 files changed, 285 insertions, 118 deletions
diff --git a/src/core/SkMaskBlurFilter.cpp b/src/core/SkMaskBlurFilter.cpp index 294b4a6322..0ab85000a4 100644 --- a/src/core/SkMaskBlurFilter.cpp +++ b/src/core/SkMaskBlurFilter.cpp @@ -11,6 +11,7 @@ #include <climits> #include "SkArenaAlloc.h" +#include "SkNx.h" #include "SkSafeMath.h" static const double kPi = 3.14159265358979323846264338327950288; @@ -25,7 +26,13 @@ class BlurScanInterface { public: virtual ~BlurScanInterface() = default; virtual void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd, - uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const = 0; + uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const = 0; + virtual bool canBlur4() { return false; } + virtual void blur4Transpose( + const uint8_t* src, size_t srcStride, const uint8_t* srcEnd, + uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const { + SK_ABORT("This should not be called."); + } }; class PlanningInterface { @@ -49,120 +56,146 @@ public: SK_ABORT("Should never be called."); return nullptr; } -private: }; -// This class is deprecated, and will be replaced by Box. -class PlanBoxInteger final : public PlanningInterface { +// Old slower version of Box which uses 64 bit multiply instead of 32 bit multiple. +// Controlled by SK_SUPPORT_LEGACY_SLOW_SMALL_BLUR +class PlanBox32 final : public PlanningInterface { public: - explicit PlanBoxInteger(double sigma) { - + explicit PlanBox32(double sigma) { // Calculate the radius from sigma. Taken from the old code until something better is // figured out. auto possibleRadius = 1.5 * sigma - 0.5; - auto radius = std::max(1.0, std::ceil(possibleRadius)); - auto window = 2 * radius + 1; + auto radius = std::max(std::numeric_limits<double>::epsilon(), possibleRadius); + auto outerRadius = std::ceil(radius); + auto outerWindow = 2 * outerRadius + 1; + auto outerFactor = (1 - (outerRadius - radius)) / outerWindow; + fOuterWeight = static_cast<uint64_t>(round(outerFactor * (1ull << 32))); + + auto innerRadius = outerRadius - 1; + auto innerWindow = 2 * innerRadius + 1; + auto innerFactor = (1 - (radius - innerRadius)) / innerWindow; + fInnerWeight = static_cast<uint64_t>(round(innerFactor * (1ull << 32))); - fWindow = static_cast<size_t>(window); + // Sliding window is defined by the relationship between the outer and inner widows. + // In the single window case, you add the element on the right, and subtract the element on + // the left. But, because two windows are used, this relationship is more complicated; an + // element is added from the right of the outer window, and subtracted from the left of the + // inner window. Because innerWindow = outerWindow - 2, the distance between + // the left and right in the two window case is outerWindow - 1. + fSlidingWindow = static_cast<size_t>(outerWindow - 1); } - size_t bufferSize() const override { return fWindow - 1; } + size_t bufferSize() const override { return 0; } - size_t border() const override { return (fWindow - 1) / 2; } + // Remember that sliding window = window - 1. Therefore, radius = sliding window / 2. + size_t border() const override { return fSlidingWindow / 2; } bool needsBlur() const override { return true; } BlurScanInterface* makeBlurScan( SkArenaAlloc* alloc, size_t width, uint32_t* buffer) const override { - auto weight = static_cast<uint64_t>(round(1.0 / fWindow * (1ull << 32))); - auto bufferEnd = buffer + this->bufferSize(); - size_t noChangeCount = fWindow > width ? fWindow - width : 0; + size_t noChangeCount; + size_t trailingEdgeZeroCount; + + // The relation between the slidingWindow and the width dictates two operating modes. + // * width >= slidingWindow - both sides of the window are contained in the image while + // scanning. Therefore, we assume that slidingWindow zeros are consumed on the trailing + // edge of the window. After this count, then both edges are traversing the image. + // * slidingWindow > width - both sides of the window are off the image while scanning + // the middle. The front edge of the window can only travel width until it falls off the + // image. At this point, both edges of the window are off the image consuming zeros + // and therefore, the destination value does not change. The scan produces unchanged + // values until the trailing edge of the window enters the image. This count is + // slidingWindow - width. + if (width >= fSlidingWindow) { + noChangeCount = 0; + trailingEdgeZeroCount = fSlidingWindow; + } else { + noChangeCount = fSlidingWindow - width; + trailingEdgeZeroCount = width; + } + return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount); - return alloc->make<BoxInteger>(weight, buffer, bufferEnd, noChangeCount); } private: - class BoxInteger final : public BlurScanInterface { + class Box final : public BlurScanInterface { public: - BoxInteger(uint64_t weight, uint32_t* buffer, uint32_t* bufferEnd, size_t noChangeCount) - : fWeight{weight} - , fBuffer0{buffer} - , fBuffer0End{bufferEnd} - , fNoChangeCount{noChangeCount} { } + Box(uint64_t outerWeight, uint64_t innerWeight, + size_t noChangeCount, size_t trailingEdgeZeroCount) + : fOuterWeight{outerWeight} + , fInnerWeight{innerWeight} + , fNoChangeCount{noChangeCount} + , fTrailingEdgeZeroCount{trailingEdgeZeroCount} { } void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd, uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override { - auto buffer0Cursor = fBuffer0; - std::memset(&fBuffer0[0], 0, (fBuffer0End - fBuffer0) * sizeof(*fBuffer0)); - uint32_t sum0 = 0; - - // Consume the source generating pixels. - for (auto srcCursor = src; - srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) { - uint32_t s = *srcCursor; - sum0 += s; + auto rightOuter = src; + auto dstCursor = dst; - *dst = this->finalScale(sum0); + uint32_t outerSum = 0; + uint32_t innerSum = 0; + for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) { + innerSum = outerSum; + outerSum += *rightOuter; + *dstCursor = this->interpolateSums(outerSum, innerSum); - sum0 -= *buffer0Cursor; - *buffer0Cursor = s; - buffer0Cursor = - (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0]; + rightOuter += srcStride; + dstCursor += dstStride; } - // This handles the case when both ends of the box are not between [src, srcEnd), and - // both are zero at that point. + // slidingWindow > width for (size_t i = 0; i < fNoChangeCount; i++) { - uint32_t s = 0; - sum0 += s; - - *dst = this->finalScale(sum0); - - sum0 -= *buffer0Cursor; - *buffer0Cursor = s; - buffer0Cursor = - (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0]; - dst += dstStride; + *dstCursor = this->interpolateSums(outerSum, innerSum);; + dstCursor += dstStride; } - // Starting from the right, fill in the rest of the buffer. - std::memset(&fBuffer0[0], 0, (fBuffer0End - &fBuffer0[0]) * sizeof(fBuffer0[0])); + // width > slidingWindow + auto leftInner = src; + while (rightOuter < srcEnd) { + innerSum = outerSum - *leftInner; + outerSum += *rightOuter; + *dstCursor = this->interpolateSums(outerSum, innerSum); + outerSum -= *leftInner; - sum0 = 0; + rightOuter += srcStride; + leftInner += srcStride; + dstCursor += dstStride; + } - uint8_t* dstCursor = dstEnd; - const uint8_t* srcCursor = srcEnd; - do { + auto leftOuter = srcEnd; + dstCursor = dstEnd; + outerSum = 0; + for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) { + leftOuter -= srcStride; dstCursor -= dstStride; - srcCursor -= srcStride; - uint32_t s = *srcCursor; - sum0 += s; - *dstCursor = this->finalScale(sum0); - - sum0 -= *buffer0Cursor; - *buffer0Cursor = s; - buffer0Cursor = - (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0]; - } while (dstCursor > dst); + innerSum = outerSum; + outerSum += *leftOuter; + *dstCursor = this->interpolateSums(outerSum, innerSum); + } } + private: static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31; - uint8_t finalScale(uint32_t sum) const { - return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32); + uint8_t interpolateSums(uint32_t outerSum, uint32_t innerSum) const { + return SkTo<uint8_t>((fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 32); } - - const uint64_t fWeight; - uint32_t* const fBuffer0; - uint32_t* const fBuffer0End; - const size_t fNoChangeCount; + uint64_t fOuterWeight; + uint64_t fInnerWeight; + size_t fNoChangeCount; + size_t fTrailingEdgeZeroCount; }; - - size_t fWindow; +private: + uint64_t fOuterWeight; + uint64_t fInnerWeight; + size_t fSlidingWindow; }; + class PlanBox final : public PlanningInterface { public: explicit PlanBox(double sigma) { @@ -173,12 +206,12 @@ public: auto outerRadius = std::ceil(radius); auto outerWindow = 2 * outerRadius + 1; auto outerFactor = (1 - (outerRadius - radius)) / outerWindow; - fOuterWeight = static_cast<uint64_t>(round(outerFactor * (1ull << 32))); + fOuterWeight = static_cast<uint32_t>(round(outerFactor * (1ull << 24))); auto innerRadius = outerRadius - 1; auto innerWindow = 2 * innerRadius + 1; auto innerFactor = (1 - (radius - innerRadius)) / innerWindow; - fInnerWeight = static_cast<uint64_t>(round(innerFactor * (1ull << 32))); + fInnerWeight = static_cast<uint32_t>(round(innerFactor * (1ull << 24))); // Sliding window is defined by the relationship between the outer and inner widows. // In the single window case, you add the element on the right, and subtract the element on @@ -189,7 +222,9 @@ public: fSlidingWindow = static_cast<size_t>(outerWindow - 1); } - size_t bufferSize() const override { return 0; } + size_t bufferSize() const override { + return fSlidingWindow * (sizeof(Sk4u) / sizeof(uint32_t)); + } // Remember that sliding window = window - 1. Therefore, radius = sliding window / 2. size_t border() const override { return fSlidingWindow / 2; } @@ -219,31 +254,41 @@ public: noChangeCount = fSlidingWindow - width; trailingEdgeZeroCount = width; } - return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount); + Sk4u* sk4uBuffer = reinterpret_cast<Sk4u*>(buffer); + return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount, + sk4uBuffer, sk4uBuffer + fSlidingWindow); } private: class Box final : public BlurScanInterface { public: - Box(uint64_t outerWeight, uint64_t innerWeight, - size_t noChangeCount, size_t trailingEdgeZeroCount) + Box(uint32_t outerWeight, uint32_t innerWeight, + size_t noChangeCount, size_t trailingEdgeZeroCount, + Sk4u* buffer, Sk4u* bufferEnd) : fOuterWeight{outerWeight} , fInnerWeight{innerWeight} , fNoChangeCount{noChangeCount} - , fTrailingEdgeZeroCount{trailingEdgeZeroCount} { } + , fTrailingEdgeZeroCount{trailingEdgeZeroCount} + , fBuffer{buffer} + , fBufferEnd{bufferEnd} { } void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd, - uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override { + uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override { auto rightOuter = src; auto dstCursor = dst; + auto interpolateSums = [this](uint32_t outerSum, uint32_t innerSum) { + return SkTo<uint8_t>( + (fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 24); + }; + uint32_t outerSum = 0; uint32_t innerSum = 0; for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) { innerSum = outerSum; outerSum += *rightOuter; - *dstCursor = this->interpolateSums(outerSum, innerSum); + *dstCursor = interpolateSums(outerSum, innerSum); rightOuter += srcStride; dstCursor += dstStride; @@ -251,7 +296,7 @@ private: // slidingWindow > width for (size_t i = 0; i < fNoChangeCount; i++) { - *dstCursor = this->interpolateSums(outerSum, innerSum);; + *dstCursor = interpolateSums(outerSum, innerSum);; dstCursor += dstStride; } @@ -260,7 +305,7 @@ private: while (rightOuter < srcEnd) { innerSum = outerSum - *leftInner; outerSum += *rightOuter; - *dstCursor = this->interpolateSums(outerSum, innerSum); + *dstCursor = interpolateSums(outerSum, innerSum); outerSum -= *leftInner; rightOuter += srcStride; @@ -277,24 +322,108 @@ private: innerSum = outerSum; outerSum += *leftOuter; - *dstCursor = this->interpolateSums(outerSum, innerSum); + *dstCursor = interpolateSums(outerSum, innerSum); } } - private: - static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31; + bool canBlur4() override { return true; } - uint8_t interpolateSums(uint32_t outerSum, uint32_t innerSum) const { - return SkTo<uint8_t>((fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 32); + // NB this is a transposing scan. The next src is src+1, and the next down is + // src+srcStride. + void blur4Transpose( + const uint8_t* src, size_t srcStride, const uint8_t* srcEnd, + uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override { + auto rightOuter = src; + auto dstCursor = dst; + + Sk4u* const bufferStart = fBuffer; + Sk4u* bufferCursor = bufferStart; + Sk4u* const bufferEnd = fBufferEnd; + + const Sk4u outerWeight(SkTo<uint32_t>(fOuterWeight)); + const Sk4u innerWeight(SkTo<uint32_t>(fInnerWeight)); + + auto load = [](const uint8_t* cursor, size_t stride) -> Sk4u { + return Sk4u(cursor[0*stride], cursor[1*stride], cursor[2*stride], cursor[3*stride]); + }; + + auto interpolateSums = [&] (const Sk4u& outerSum, const Sk4u& innerSum) { + return + SkNx_cast<uint8_t>( + (outerSum * outerWeight + innerSum * innerWeight + kHalf) >> 24); + }; + + Sk4u outerSum = 0; + Sk4u innerSum = 0; + for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) { + innerSum = outerSum; + + Sk4u leadingEdge = load(rightOuter, srcStride); + outerSum += leadingEdge; + Sk4b blurred = interpolateSums(outerSum, innerSum); + blurred.store(dstCursor); + + leadingEdge.store(bufferCursor); + bufferCursor = (bufferCursor + 1) < bufferEnd ? bufferCursor + 1 : bufferStart; + + rightOuter += 1; + dstCursor += dstStride; + } + + // slidingWindow > width + for (size_t i = 0; i < fNoChangeCount; i++) { + Sk4b blurred = interpolateSums(outerSum, innerSum); + blurred.store(dstCursor); + dstCursor += dstStride; + } + + // width > slidingWindow + auto leftInner = src; + while (rightOuter < srcEnd) { + Sk4u trailEdge = Sk4u::Load(bufferCursor); + Sk4u leadingEdge = load(rightOuter, srcStride); + innerSum = outerSum - trailEdge; + outerSum += leadingEdge; + + Sk4b blurred = interpolateSums(outerSum, innerSum); + blurred.store(dstCursor); + + outerSum -= trailEdge; + leadingEdge.store(bufferCursor); + bufferCursor = (bufferCursor + 1) < bufferEnd ? bufferCursor + 1 : bufferStart; + + rightOuter += 1; + leftInner += 1; + dstCursor += dstStride; + } + + auto leftOuter = srcEnd; + dstCursor = dstEnd; + outerSum = 0; + for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) { + leftOuter -= 1; + dstCursor -= dstStride; + + innerSum = outerSum; + outerSum += load(leftOuter, srcStride); + Sk4b blurred = interpolateSums(outerSum, innerSum); + blurred.store(dstCursor); + } } - uint64_t fOuterWeight; - uint64_t fInnerWeight; - size_t fNoChangeCount; - size_t fTrailingEdgeZeroCount; + + private: + static constexpr uint32_t kHalf = static_cast<uint32_t>(1) << 23; + + const uint32_t fOuterWeight; + const uint32_t fInnerWeight; + const size_t fNoChangeCount; + const size_t fTrailingEdgeZeroCount; + Sk4u* const fBuffer; + Sk4u* const fBufferEnd; }; private: - uint64_t fOuterWeight; - uint64_t fInnerWeight; + uint32_t fOuterWeight; + uint32_t fInnerWeight; size_t fSlidingWindow; }; @@ -421,8 +550,8 @@ public: // Consume the source generating pixels. for (auto srcCursor = src; srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) { - uint32_t s = *srcCursor; - sum0 += s; + uint32_t leadingEdge = *srcCursor; + sum0 += leadingEdge; sum1 += sum0; sum2 += sum1; @@ -437,13 +566,14 @@ public: buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; sum0 -= *buffer0Cursor; - *buffer0Cursor = s; + *buffer0Cursor = leadingEdge; buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; } + // The leading edge is off the right side of the mask. for (size_t i = 0; i < fNoChangeCount; i++) { - uint32_t s = 0; - sum0 += s; + uint32_t leadingEdge = 0; + sum0 += leadingEdge; sum1 += sum0; sum2 += sum1; @@ -458,7 +588,7 @@ public: buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; sum0 -= *buffer0Cursor; - *buffer0Cursor = s; + *buffer0Cursor = leadingEdge; buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; dst += dstStride; @@ -474,8 +604,8 @@ public: while (dstCursor > dst) { dstCursor -= dstStride; srcCursor -= srcStride; - uint32_t s = *srcCursor; - sum0 += s; + uint32_t leadingEdge = *srcCursor; + sum0 += leadingEdge; sum1 += sum0; sum2 += sum1; @@ -490,7 +620,7 @@ public: buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; sum0 -= *buffer0Cursor; - *buffer0Cursor = s; + *buffer0Cursor = leadingEdge; buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; } } @@ -526,8 +656,8 @@ static PlanningInterface* make_plan(SkArenaAlloc* alloc, double sigma) { if (3 * sigma <= 1) { plan = alloc->make<None>(); } else if (sigma < kSmallSigma) { - #if defined(SK_LEGACY_SUPPORT_INTEGER_SMALL_RADII) - plan = alloc->make<PlanBoxInteger>(sigma); + #if defined(SK_SUPPORT_LEGACY_SLOW_SMALL_BLUR) + plan = alloc->make<PlanBox32>(sigma); #else plan = alloc->make<PlanBox>(sigma); #endif @@ -598,21 +728,44 @@ SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const { // Blur both directions. size_t tmpW = srcH; size_t tmpH = dstW; + auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH); // Blur horizontally, and transpose. auto scanW = planW->makeBlurScan(&alloc, srcW, buffer); - for (size_t y = 0; y < srcH; y++) { + size_t y = 0; + if (scanW->canBlur4() && srcH > 4) { + for (;y + 4 <= srcH; y += 4) { + auto srcStart = &src.fImage[y * src.fRowBytes]; + auto tmpStart = &tmp[y]; + scanW->blur4Transpose(srcStart, src.fRowBytes, srcStart + srcW, + tmpStart, tmpW, tmpStart + tmpW * tmpH); + } + } + + for (;y < srcH; y++) { auto srcStart = &src.fImage[y * src.fRowBytes]; auto tmpStart = &tmp[y]; - scanW->blur(srcStart, 1, srcStart + srcW, + scanW->blur(srcStart, 1, srcStart + srcW, tmpStart, tmpW, tmpStart + tmpW * tmpH); } + // Blur vertically (scan in memory order because of the transposition), // and transpose back to the original orientation. auto scanH = planH->makeBlurScan(&alloc, tmpW, buffer); - for (size_t y = 0; y < tmpH; y++) { + y = 0; + if (scanH->canBlur4() && tmpH > 4) { + for (;y + 4 <= tmpH; y += 4) { + auto tmpStart = &tmp[y * tmpW]; + auto dstStart = &dst->fImage[y]; + + scanH->blur4Transpose( + tmpStart, tmpW, tmpStart + tmpW, + dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH); + } + } + for (;y < tmpH; y++) { auto tmpStart = &tmp[y * tmpW]; auto dstStart = &dst->fImage[y]; diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index 4d7aefc93f..b906a02c82 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -564,6 +564,11 @@ template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) { return vqmovn_u16(vcombine_u16(_16, _16)); } +template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) { + uint16x4_t _16 = vqmovn_u32(src.fVec); + return vqmovn_u16(vcombine_u16(_16, _16)); +} + template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) { return vreinterpretq_s32_u32(vmovl_u16(src.fVec)); } diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 415d6e1205..469aefb145 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -165,6 +165,17 @@ public: __m128 fVec; }; +AI static __m128i mullo32(__m128i a, __m128i b) { +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 + return _mm_mullo_epi32(a, b); +#else + __m128i mul20 = _mm_mul_epu32(a, b), + mul31 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), + _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); +#endif +} + template <> class SkNx<4, int32_t> { public: @@ -179,15 +190,10 @@ public: AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } - AI SkNx operator * (const SkNx& o) const { - __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), - mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); - return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), - _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); - } + AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); } AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } - AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } + AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } @@ -254,10 +260,10 @@ public: AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } - // Not quite sure how to best do operator * in SSE2. We probably don't use it. + AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); } AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } - AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } + AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } @@ -465,7 +471,6 @@ public: AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) : fVec(_mm_setr_epi8(a,b,c,d, 0,0,0,0, 0,0,0,0, 0,0,0,0)) {} - AI static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); } AI void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); } @@ -612,6 +617,10 @@ template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) { return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); } +template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) { + return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); +} + template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) { return src.fVec; } |