diff options
-rw-r--r-- | bench/ColorCodecBench.cpp | 57 | ||||
-rw-r--r-- | bench/ColorCodecBench.h | 3 | ||||
-rw-r--r-- | src/core/SkHalf.h | 22 | ||||
-rw-r--r-- | src/core/SkMipMap.cpp | 4 | ||||
-rw-r--r-- | src/core/SkNx.h | 9 | ||||
-rw-r--r-- | src/core/SkXfermodeF16.cpp | 26 | ||||
-rw-r--r-- | src/effects/gradients/Sk4fGradientPriv.h | 6 | ||||
-rw-r--r-- | src/opts/SkColorXform_opts.h | 21 | ||||
-rw-r--r-- | src/opts/SkNx_neon.h | 11 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 10 | ||||
-rw-r--r-- | tests/Float16Test.cpp | 6 |
11 files changed, 110 insertions, 65 deletions
diff --git a/bench/ColorCodecBench.cpp b/bench/ColorCodecBench.cpp index edad759860..55ec7d9d7b 100644 --- a/bench/ColorCodecBench.cpp +++ b/bench/ColorCodecBench.cpp @@ -16,6 +16,7 @@ DEFINE_bool(qcms, false, "Bench qcms color conversion"); #endif DEFINE_bool(xform_only, false, "Only time the color xform, do not include the decode time"); DEFINE_bool(srgb, false, "Convert to srgb dst space"); +DEFINE_bool(half, false, "Convert to half floats"); ColorCodecBench::ColorCodecBench(const char* name, sk_sp<SkData> encoded) : fEncoded(std::move(encoded)) @@ -43,7 +44,7 @@ void ColorCodecBench::decodeAndXform() { #ifdef SK_DEBUG const SkCodec::Result result = #endif - codec->startScanlineDecode(fInfo); + codec->startScanlineDecode(fSrcInfo); SkASSERT(SkCodec::kSuccess == result); sk_sp<SkColorSpace> srcSpace = sk_ref_sp(codec->getColorSpace()); @@ -54,15 +55,17 @@ void ColorCodecBench::decodeAndXform() { SkASSERT(xform); void* dst = fDst.get(); - for (int y = 0; y < fInfo.height(); y++) { + for (int y = 0; y < fSrcInfo.height(); y++) { #ifdef SK_DEBUG const int rows = #endif codec->getScanlines(fSrc.get(), 1, 0); SkASSERT(1 == rows); - xform->applyTo8888((SkPMColor*) dst, (uint32_t*) fSrc.get(), fInfo.width()); - dst = SkTAddOffset<void>(dst, fInfo.minRowBytes()); + FLAGS_half ? + xform->applyToF16((uint64_t*) dst, (uint32_t*) fSrc.get(), fSrcInfo.width()) : + xform->applyTo8888((SkPMColor*) dst, (uint32_t*) fSrc.get(), fSrcInfo.width()); + dst = SkTAddOffset<void>(dst, fDstInfo.minRowBytes()); } } @@ -72,7 +75,7 @@ void ColorCodecBench::decodeAndXformQCMS() { #ifdef SK_DEBUG const SkCodec::Result result = #endif - codec->startScanlineDecode(fInfo); + codec->startScanlineDecode(fSrcInfo); SkASSERT(SkCodec::kSuccess == result); SkAutoTCallVProc<qcms_profile, qcms_profile_release> @@ -91,15 +94,15 @@ void ColorCodecBench::decodeAndXformQCMS() { #endif void* dst = fDst.get(); - for (int y = 0; y < fInfo.height(); y++) { + for (int y = 0; y < fSrcInfo.height(); y++) { #ifdef SK_DEBUG const int rows = #endif codec->getScanlines(fSrc.get(), 1, 0); SkASSERT(1 == rows); - qcms_transform_data_type(transform, fSrc.get(), dst, fInfo.width(), outType); - dst = SkTAddOffset<void>(dst, fInfo.minRowBytes()); + qcms_transform_data_type(transform, fSrc.get(), dst, fSrcInfo.width(), outType); + dst = SkTAddOffset<void>(dst, fDstInfo.minRowBytes()); } } #endif @@ -114,11 +117,13 @@ void ColorCodecBench::xformOnly() { void* dst = fDst.get(); void* src = fSrc.get(); - for (int y = 0; y < fInfo.height(); y++) { + for (int y = 0; y < fSrcInfo.height(); y++) { // Transform in place - xform->applyTo8888((SkPMColor*) dst, (uint32_t*) src, fInfo.width()); - dst = SkTAddOffset<void>(dst, fInfo.minRowBytes()); - src = SkTAddOffset<void>(src, fInfo.minRowBytes()); + FLAGS_half ? + xform->applyToF16((uint64_t*) dst, (uint32_t*) src, fSrcInfo.width()) : + xform->applyTo8888((SkPMColor*) dst, (uint32_t*) src, fSrcInfo.width()); + dst = SkTAddOffset<void>(dst, fDstInfo.minRowBytes()); + src = SkTAddOffset<void>(src, fSrcInfo.minRowBytes()); } } @@ -141,26 +146,31 @@ void ColorCodecBench::xformOnlyQCMS() { void* dst = fDst.get(); void* src = fSrc.get(); - for (int y = 0; y < fInfo.height(); y++) { + for (int y = 0; y < fSrcInfo.height(); y++) { // Transform in place - qcms_transform_data_type(transform, src, dst, fInfo.width(), outType); - dst = SkTAddOffset<void>(dst, fInfo.minRowBytes()); - src = SkTAddOffset<void>(src, fInfo.minRowBytes()); + qcms_transform_data_type(transform, src, dst, fSrcInfo.width(), outType); + dst = SkTAddOffset<void>(dst, fDstInfo.minRowBytes()); + src = SkTAddOffset<void>(src, fSrcInfo.minRowBytes()); } } #endif void ColorCodecBench::onDelayedSetup() { SkAutoTDelete<SkCodec> codec(SkCodec::NewFromData(fEncoded.get())); - fInfo = codec->getInfo().makeColorType(kRGBA_8888_SkColorType); + fSrcInfo = codec->getInfo().makeColorType(kRGBA_8888_SkColorType); + + fDstInfo = fSrcInfo; + if (FLAGS_half) { + fDstInfo = fDstInfo.makeColorType(kRGBA_F16_SkColorType); + } + fDst.reset(fDstInfo.getSafeSize(fDstInfo.minRowBytes())); - fDst.reset(fInfo.getSafeSize(fInfo.minRowBytes())); if (FLAGS_xform_only) { - fSrc.reset(fInfo.getSafeSize(fInfo.minRowBytes())); - codec->getPixels(fInfo, fSrc.get(), fInfo.minRowBytes()); + fSrc.reset(fSrcInfo.getSafeSize(fSrcInfo.minRowBytes())); + codec->getPixels(fSrcInfo, fSrc.get(), fSrcInfo.minRowBytes()); } else { // Set-up a row buffer to decode into before transforming to dst. - fSrc.reset(fInfo.minRowBytes()); + fSrc.reset(fSrcInfo.minRowBytes()); } fSrcData = codec->getICCData(); @@ -189,6 +199,11 @@ void ColorCodecBench::onDelayedSetup() { } void ColorCodecBench::onDraw(int n, SkCanvas*) { + if ((FLAGS_srgb || FLAGS_qcms) && FLAGS_half) { + SkDebugf("Error: Contradicting flags.\n"); + return; + } + for (int i = 0; i < n; i++) { #if defined(SK_TEST_QCMS) if (FLAGS_qcms) { diff --git a/bench/ColorCodecBench.h b/bench/ColorCodecBench.h index 9da505fd47..45bf41ec2d 100644 --- a/bench/ColorCodecBench.h +++ b/bench/ColorCodecBench.h @@ -36,7 +36,8 @@ private: SkString fName; sk_sp<SkData> fEncoded; - SkImageInfo fInfo; + SkImageInfo fSrcInfo; + SkImageInfo fDstInfo; SkAutoMalloc fDst; SkAutoMalloc fSrc; sk_sp<SkColorSpace> fDstSpace; diff --git a/src/core/SkHalf.h b/src/core/SkHalf.h index 2f2ed66c6a..adf8d3a126 100644 --- a/src/core/SkHalf.h +++ b/src/core/SkHalf.h @@ -16,9 +16,10 @@ // only used for storage typedef uint16_t SkHalf; -#define SK_HalfMin 0x0400 // 2^-24 (minimum positive normal value) -#define SK_HalfMax 0x7bff // 65504 -#define SK_HalfEpsilon 0x1400 // 2^-10 +static constexpr uint16_t SK_HalfMin = 0x0400; // 2^-24 (minimum positive normal value) +static constexpr uint16_t SK_HalfMax = 0x7bff; // 65504 +static constexpr uint16_t SK_HalfEpsilon = 0x1400; // 2^-10 +static constexpr uint16_t SK_Half1 = 0x3C00; // 1 // convert between half and single precision floating point float SkHalfToFloat(SkHalf h); @@ -26,8 +27,8 @@ SkHalf SkFloatToHalf(float f); // Convert between half and single precision floating point, // assuming inputs and outputs are both finite. -static inline Sk4f SkHalfToFloat_finite(uint64_t); -static inline uint64_t SkFloatToHalf_finite(const Sk4f&); +static inline Sk4f SkHalfToFloat_finite(uint64_t); +static inline Sk4h SkFloatToHalf_finite(const Sk4f&); // ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ // @@ -65,14 +66,12 @@ static inline Sk4f SkHalfToFloat_finite(uint64_t hs) { #endif } -static inline uint64_t SkFloatToHalf_finite(const Sk4f& fs) { - uint64_t r; +static inline Sk4h SkFloatToHalf_finite(const Sk4f& fs) { #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64) float32x4_t vec = fs.fVec; asm ("fcvtn %[vec].4h, %[vec].4s \n" // vcvt_f16_f32(vec) - "fmov %[r], %d[vec] \n" // vst1_f16(&r, ...) - : [r] "=r" (r) // =r: write-only 64-bit general register - , [vec] "+w" (vec)); // +w: read-write NEON register + : [vec] "+w" (vec)); // +w: read-write NEON register + return vreinterpret_u16_f32(vget_low_f32(vec)); #else Sk4i bits = Sk4i::Load(&fs), sign = bits & 0x80000000, // Save the sign bit for later... @@ -91,9 +90,8 @@ static inline uint64_t SkFloatToHalf_finite(const Sk4f& fs) { Sk4i denorm = Sk4i::Load(&plus_K) ^ K; Sk4i merged = (sign >> 16) | will_be_denorm.thenElse(denorm, norm); - SkNx_cast<uint16_t>(merged).store(&r); + return SkNx_cast<uint16_t>(merged); #endif - return r; } #endif diff --git a/src/core/SkMipMap.cpp b/src/core/SkMipMap.cpp index 4811c9e073..cb9cc85a98 100644 --- a/src/core/SkMipMap.cpp +++ b/src/core/SkMipMap.cpp @@ -88,7 +88,9 @@ struct ColorTypeFilter_F16 { return SkHalfToFloat_finite(x); } static uint64_t Compact(const Sk4f& x) { - return SkFloatToHalf_finite(x); + uint64_t r; + SkFloatToHalf_finite(x).store(&r); + return r; } }; diff --git a/src/core/SkNx.h b/src/core/SkNx.h index dec63f8d89..253fcf22fe 100644 --- a/src/core/SkNx.h +++ b/src/core/SkNx.h @@ -309,6 +309,15 @@ SI Sk4i Sk4f_round(const Sk4f& x) { (int) lrintf (x[3]), }; } +// Transpose 4 Sk4h and store (256 bits total). +SI void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b, const Sk4h& a) { + uint64_t* dst64 = (uint64_t*) dst; + Sk4h(r[0], g[0], b[0], a[0]).store(dst64 + 0); + Sk4h(r[1], g[1], b[1], a[1]).store(dst64 + 1); + Sk4h(r[2], g[2], b[2], a[2]).store(dst64 + 2); + Sk4h(r[3], g[3], b[3], a[3]).store(dst64 + 3); +} + #endif SI void Sk4f_ToBytes(uint8_t p[16], const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { diff --git a/src/core/SkXfermodeF16.cpp b/src/core/SkXfermodeF16.cpp index 63058f9dce..219e91188e 100644 --- a/src/core/SkXfermodeF16.cpp +++ b/src/core/SkXfermodeF16.cpp @@ -25,13 +25,13 @@ static void xfer_1(const SkXfermode* xfer, uint64_t dst[], const SkPM4f* src, in Sk4f d4 = SkHalfToFloat_finite(dst[i]); d4.store(d.fVec); Sk4f r4 = Sk4f::Load(proc(*src, d).fVec); - dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])); + SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]); } } else { for (int i = 0; i < count; ++i) { SkHalfToFloat_finite(dst[i]).store(d.fVec); Sk4f r4 = Sk4f::Load(proc(*src, d).fVec); - dst[i] = SkFloatToHalf_finite(r4); + SkFloatToHalf_finite(r4).store(&dst[i]); } } } @@ -45,13 +45,13 @@ static void xfer_n(const SkXfermode* xfer, uint64_t dst[], const SkPM4f src[], i Sk4f d4 = SkHalfToFloat_finite(dst[i]); d4.store(d.fVec); Sk4f r4 = Sk4f::Load(proc(src[i], d).fVec); - dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])); + SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]); } } else { for (int i = 0; i < count; ++i) { SkHalfToFloat_finite(dst[i]).store(d.fVec); Sk4f r4 = Sk4f::Load(proc(src[i], d).fVec); - dst[i] = SkFloatToHalf_finite(r4); + SkFloatToHalf_finite(r4).store(&dst[i]); } } } @@ -65,7 +65,7 @@ static void clear(const SkXfermode*, uint64_t dst[], const SkPM4f*, int count, c for (int i = 0; i < count; ++i) { if (aa[i]) { const Sk4f d4 = SkHalfToFloat_finite(dst[i]); - dst[i] = SkFloatToHalf_finite(d4 * Sk4f((255 - aa[i]) * 1.0f/255)); + SkFloatToHalf_finite(d4 * Sk4f((255 - aa[i]) * 1.0f/255)).store(&dst[i]); } } } else { @@ -83,10 +83,12 @@ static void src_1(const SkXfermode*, uint64_t dst[], const SkPM4f* src, int coun if (aa) { for (int i = 0; i < count; ++i) { const Sk4f d4 = SkHalfToFloat_finite(dst[i]); - dst[i] = SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i])); + SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i])).store(&dst[i]); } } else { - sk_memset64(dst, SkFloatToHalf_finite(s4), count); + uint64_t s4h; + SkFloatToHalf_finite(s4).store(&s4h); + sk_memset64(dst, s4h, count); } } @@ -96,12 +98,12 @@ static void src_n(const SkXfermode*, uint64_t dst[], const SkPM4f src[], int cou for (int i = 0; i < count; ++i) { const Sk4f s4 = Sk4f::Load(src[i].fVec); const Sk4f d4 = SkHalfToFloat_finite(dst[i]); - dst[i] = SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i])); + SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i])).store(&dst[i]); } } else { for (int i = 0; i < count; ++i) { const Sk4f s4 = Sk4f::Load(src[i].fVec); - dst[i] = SkFloatToHalf_finite(s4); + SkFloatToHalf_finite(s4).store(&dst[i]); } } } @@ -124,9 +126,9 @@ static void srcover_1(const SkXfermode*, uint64_t dst[], const SkPM4f* src, int const Sk4f d4 = SkHalfToFloat_finite(dst[i]); const Sk4f r4 = s4 + d4 * dst_scale; if (aa) { - dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])); + SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]); } else { - dst[i] = SkFloatToHalf_finite(r4); + SkFloatToHalf_finite(r4).store(&dst[i]); } } } @@ -140,7 +142,7 @@ static void srcover_n(const SkXfermode*, uint64_t dst[], const SkPM4f src[], int if (aa) { r = lerp_by_coverage(r, d, aa[i]); } - dst[i] = SkFloatToHalf_finite(r); + SkFloatToHalf_finite(r).store(&dst[i]); } } diff --git a/src/effects/gradients/Sk4fGradientPriv.h b/src/effects/gradients/Sk4fGradientPriv.h index 68e95a63d9..9745119fd4 100644 --- a/src/effects/gradients/Sk4fGradientPriv.h +++ b/src/effects/gradients/Sk4fGradientPriv.h @@ -143,11 +143,13 @@ struct DstTraits<DstType::F16, premul> { } static void store(const Sk4f& c, Type* dst) { - *dst = SkFloatToHalf_finite(PM::apply(c)); + SkFloatToHalf_finite(PM::apply(c)).store(dst); } static void store(const Sk4f& c, Type* dst, int n) { - sk_memset64(dst, SkFloatToHalf_finite(PM::apply(c)), n); + uint64_t color; + SkFloatToHalf_finite(PM::apply(c)).store(&color); + sk_memset64(dst, color, n); } static void store4x(const Sk4f& c0, const Sk4f& c1, diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h index e7a2b45946..3bb11f5599 100644 --- a/src/opts/SkColorXform_opts.h +++ b/src/opts/SkColorXform_opts.h @@ -126,16 +126,10 @@ static void color_xform_RGB1(void* dst, const uint32_t* src, int len, dst = SkTAddOffset<void>(dst, 4 * sizeof(uint32_t)); } else { - // FIXME (msarett): - // Can we do better here? Should we store half floats as planar? - // Should we write Intel/Arm specific code? Should we add a transpose - // function to SkNx? Should we rewrite the algorithm to be interleaved? - uint64_t* dst64 = (uint64_t*) dst; - dst64[0] = SkFloatToHalf_finite(Sk4f(dstReds[0], dstGreens[0], dstBlues[0], 1.0f)); - dst64[1] = SkFloatToHalf_finite(Sk4f(dstReds[1], dstGreens[1], dstBlues[1], 1.0f)); - dst64[2] = SkFloatToHalf_finite(Sk4f(dstReds[2], dstGreens[2], dstBlues[2], 1.0f)); - dst64[3] = SkFloatToHalf_finite(Sk4f(dstReds[3], dstGreens[3], dstBlues[3], 1.0f)); - + Sk4h_store4(dst, SkFloatToHalf_finite(dstReds), + SkFloatToHalf_finite(dstGreens), + SkFloatToHalf_finite(dstBlues), + SK_Half1); dst = SkTAddOffset<void>(dst, 4 * sizeof(uint64_t)); } }; @@ -185,10 +179,9 @@ static void color_xform_RGB1(void* dst, const uint32_t* src, int len, dst = SkTAddOffset<void>(dst, sizeof(uint32_t)); } else { - uint64_t rgba = SkFloatToHalf_finite(dstPixel); - - // Set alpha to 1.0 - rgba |= 0x3C00000000000000; + uint64_t rgba; + SkFloatToHalf_finite(dstPixel).store(&rgba); + rgba |= static_cast<uint64_t>(SK_Half1) << 48; *((uint64_t*) dst) = rgba; dst = SkTAddOffset<void>(dst, sizeof(uint64_t)); } diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index 2f73e0368d..53e95eb97f 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -483,4 +483,15 @@ static inline Sk4i Sk4f_round(const Sk4f& x) { return vcvtq_s32_f32((x + 0.5f).fVec); } +static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b, + const Sk4h& a) { + uint16x4x4_t rgba = {{ + r.fVec, + g.fVec, + b.fVec, + a.fVec, + }}; + vst4_u16((uint16_t*) dst, rgba); +} + #endif//SkNx_neon_DEFINED diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 553ff1de3e..c0e48287b4 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -403,4 +403,14 @@ static inline Sk4i Sk4f_round(const Sk4f& x) { return _mm_cvtps_epi32(x.fVec); } +static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b, + const Sk4h& a) { + __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec); + __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec); + __m128i lo = _mm_unpacklo_epi32(rg, ba); + __m128i hi = _mm_unpackhi_epi32(rg, ba); + _mm_storeu_si128(((__m128i*) dst) + 0, lo); + _mm_storeu_si128(((__m128i*) dst) + 1, hi); +} + #endif//SkNx_sse_DEFINED diff --git a/tests/Float16Test.cpp b/tests/Float16Test.cpp index 2091652522..8ab56048e9 100644 --- a/tests/Float16Test.cpp +++ b/tests/Float16Test.cpp @@ -73,7 +73,9 @@ DEF_TEST(HalfToFloat_finite, r) { u(f), f); } REPORTER_ASSERT(r, SkHalfToFloat_finite(h)[0] == f); - REPORTER_ASSERT(r, SkFloatToHalf_finite(SkHalfToFloat_finite(h)) == h); + uint64_t result; + SkFloatToHalf_finite(SkHalfToFloat_finite(h)).store(&result); + REPORTER_ASSERT(r, result == h); } } } @@ -89,7 +91,7 @@ DEF_TEST(FloatToHalf_finite, r) { float f; memcpy(&f, &bits, 4); if (isfinite(f) && isfinite(SkHalfToFloat(SkFloatToHalf(f)))) { - uint16_t h1 = (uint16_t)SkFloatToHalf_finite(Sk4f(f,0,0,0)), + uint16_t h1 = SkFloatToHalf_finite(Sk4f(f,0,0,0))[0], h2 = SkFloatToHalf(f); bool ok = (h1 == h2 || h1 == h2-1); REPORTER_ASSERT(r, ok); |