diff options
author | msarett <msarett@google.com> | 2016-06-08 14:43:53 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-06-08 14:43:53 -0700 |
commit | a9e878c836994bce695274b4c28890290139dcdf (patch) | |
tree | 0bb9f71f6606514ae0d291c03f0893682f5bd42e | |
parent | dd40483534f57df206020c174cb56e2b2c258015 (diff) |
Optimize color xforms with 2.2 gammas for SSE2
Because we recognize commonly used gamma tables and
parameters as 2.2f, about 98% of jpegs with color profiles
will pass through this xform (assuming the dst is also
2.2f). Sample size is 10,322 jpegs.
I won't go crazy with performance numbers because this is
a work in progress, particularly in terms of correctness:
the 2.2 gamma curves are currently approximated with x^2
and sqrt(x).
Benchmark: 201295.jpg on HP z620
(300x280, most common form of sRGB profile)
Decode Time + QCMS Xform:     1.28 ms
QCMS Xform Only:              0.495 ms
Decode Time + Skia Opt Xform: 1.01 ms
Skia Opt Xform Only:          0.235 ms
Decode Time + Xform Speed-up: 1.27x
Xform Only Speed-up:          2.11x
FWIW, Skia xform time before these optimizations was
41.1 ms. But we expected that code to be slow.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2046013002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review-Url: https://codereview.chromium.org/2046013002
-rw-r--r-- | bench/ColorCodecBench.cpp | 4 | ||||
-rw-r--r-- | src/core/SkColorSpaceXform.cpp | 112 | ||||
-rw-r--r-- | src/core/SkColorSpaceXform.h | 8 | ||||
-rw-r--r-- | src/core/SkOpts.cpp | 4 | ||||
-rw-r--r-- | src/core/SkOpts.h | 5 | ||||
-rw-r--r-- | src/opts/SkColorXform_opts.h | 181 |
6 files changed, 240 insertions, 74 deletions
diff --git a/bench/ColorCodecBench.cpp b/bench/ColorCodecBench.cpp index 85785d864c..bfbea62c41 100644 --- a/bench/ColorCodecBench.cpp +++ b/bench/ColorCodecBench.cpp @@ -172,6 +172,10 @@ void ColorCodecBench::onDelayedSetup() { if (FLAGS_qcms) { fDstSpaceQCMS.reset(qcms_profile_from_memory(dstData->data(), dstData->size())); SkASSERT(fDstSpaceQCMS); + + // This call takes a non-trivial amount of time, but I think it's the most fair to + // treat it as overhead. It only needs to happen once. + qcms_profile_precache_output_transform(fDstSpaceQCMS); } else #endif { diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp index 216e9931ba..0faff88286 100644 --- a/src/core/SkColorSpaceXform.cpp +++ b/src/core/SkColorSpaceXform.cpp @@ -8,6 +8,7 @@ #include "SkColorPriv.h" #include "SkColorSpace_Base.h" #include "SkColorSpaceXform.h" +#include "SkOpts.h" static inline bool compute_gamut_xform(SkMatrix44* srcToDst, const SkMatrix44& srcToXYZ, const SkMatrix44& dstToXYZ) { @@ -36,18 +37,10 @@ std::unique_ptr<SkColorSpaceXform> SkColorSpaceXform::New(const sk_sp<SkColorSpa return nullptr; } - if (as_CSB(srcSpace)->gammas()->isValues() && as_CSB(dstSpace)->gammas()->isValues()) { - float srcGammas[3]; - float dstGammas[3]; - srcGammas[0] = as_CSB(srcSpace)->gammas()->fRed.fValue; - srcGammas[1] = as_CSB(srcSpace)->gammas()->fGreen.fValue; - srcGammas[2] = as_CSB(srcSpace)->gammas()->fBlue.fValue; - dstGammas[0] = 1.0f / as_CSB(dstSpace)->gammas()->fRed.fValue; - dstGammas[1] = 1.0f / as_CSB(dstSpace)->gammas()->fGreen.fValue; - dstGammas[2] = 1.0f / as_CSB(dstSpace)->gammas()->fBlue.fValue; - - return std::unique_ptr<SkColorSpaceXform>( - new SkGammaByValueXform(srcGammas, srcToDst, dstGammas)); + if (SkColorSpace::k2Dot2Curve_GammaNamed == srcSpace->gammaNamed() && + SkColorSpace::k2Dot2Curve_GammaNamed == dstSpace->gammaNamed()) + { + return std::unique_ptr<SkColorSpaceXform>(new Sk2Dot2Xform(srcToDst)); } return std::unique_ptr<SkColorSpaceXform>( @@ 
-56,72 +49,53 @@ std::unique_ptr<SkColorSpaceXform> SkColorSpaceXform::New(const sk_sp<SkColorSpa /////////////////////////////////////////////////////////////////////////////////////////////////// +Sk2Dot2Xform::Sk2Dot2Xform(const SkMatrix44& srcToDst) +{ + // Build row major 4x4 matrix: + // rX gX bX 0 + // rY gY bY 0 + // rZ gZ bZ 0 + // rQ gQ bQ 0 + fSrcToDst[0] = srcToDst.getFloat(0, 0); + fSrcToDst[1] = srcToDst.getFloat(0, 1); + fSrcToDst[2] = srcToDst.getFloat(0, 2); + fSrcToDst[3] = 0.0f; + fSrcToDst[4] = srcToDst.getFloat(1, 0); + fSrcToDst[5] = srcToDst.getFloat(1, 1); + fSrcToDst[6] = srcToDst.getFloat(1, 2); + fSrcToDst[7] = 0.0f; + fSrcToDst[8] = srcToDst.getFloat(2, 0); + fSrcToDst[9] = srcToDst.getFloat(2, 1); + fSrcToDst[10] = srcToDst.getFloat(2, 2); + fSrcToDst[11] = 0.0f; + fSrcToDst[12] = srcToDst.getFloat(3, 0); + fSrcToDst[13] = srcToDst.getFloat(3, 1); + fSrcToDst[14] = srcToDst.getFloat(3, 2); + fSrcToDst[15] = 0.0f; +} + +void Sk2Dot2Xform::xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const { + SkOpts::color_xform_2Dot2_RGBA_to_8888(dst, src, len, fSrcToDst); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + static inline float byte_to_float(uint8_t v) { return ((float) v) * (1.0f / 255.0f); } -static inline uint8_t clamp_float_to_byte(float v) { +// Expand range from 0-1 to 0-255, then convert. 
+static inline uint8_t clamp_normalized_float_to_byte(float v) { v = v * 255.0f; - if (v > 255.0f) { + if (v >= 254.5f) { return 255; - } else if (v <= 0.0f) { + } else if (v < 0.5f) { return 0; } else { return (uint8_t) (v + 0.5f); } } -/////////////////////////////////////////////////////////////////////////////////////////////////// - -SkGammaByValueXform::SkGammaByValueXform(float srcGammas[3], const SkMatrix44& srcToDst, - float dstGammas[3]) - : fSrcToDst(srcToDst) -{ - memcpy(fSrcGammas, srcGammas, 3 * sizeof(float)); - memcpy(fDstGammas, dstGammas, 3 * sizeof(float)); -} - -void SkGammaByValueXform::xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const { - while (len-- > 0) { - float srcFloats[3]; - srcFloats[0] = byte_to_float((*src >> 0) & 0xFF); - srcFloats[1] = byte_to_float((*src >> 8) & 0xFF); - srcFloats[2] = byte_to_float((*src >> 16) & 0xFF); - - // Convert to linear. - srcFloats[0] = pow(srcFloats[0], fSrcGammas[0]); - srcFloats[1] = pow(srcFloats[1], fSrcGammas[1]); - srcFloats[2] = pow(srcFloats[2], fSrcGammas[2]); - - // Convert to dst gamut. - float dstFloats[3]; - dstFloats[0] = srcFloats[0] * fSrcToDst.getFloat(0, 0) + - srcFloats[1] * fSrcToDst.getFloat(1, 0) + - srcFloats[2] * fSrcToDst.getFloat(2, 0) + fSrcToDst.getFloat(3, 0); - dstFloats[1] = srcFloats[0] * fSrcToDst.getFloat(0, 1) + - srcFloats[1] * fSrcToDst.getFloat(1, 1) + - srcFloats[2] * fSrcToDst.getFloat(2, 1) + fSrcToDst.getFloat(3, 1); - dstFloats[2] = srcFloats[0] * fSrcToDst.getFloat(0, 2) + - srcFloats[1] * fSrcToDst.getFloat(1, 2) + - srcFloats[2] * fSrcToDst.getFloat(2, 2) + fSrcToDst.getFloat(3, 2); - - // Convert to dst gamma. 
- dstFloats[0] = pow(dstFloats[0], fDstGammas[0]); - dstFloats[1] = pow(dstFloats[1], fDstGammas[1]); - dstFloats[2] = pow(dstFloats[2], fDstGammas[2]); - - *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), - clamp_float_to_byte(dstFloats[0]), - clamp_float_to_byte(dstFloats[1]), - clamp_float_to_byte(dstFloats[2])); - - dst++; - src++; - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - // Interpolating lookup in a variably sized table. static inline float interp_lut(uint8_t byte, float* table, size_t tableSize) { float index = byte_to_float(byte) * (tableSize - 1); @@ -261,9 +235,9 @@ void SkDefaultXform::xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_ } *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), - clamp_float_to_byte(dstFloats[0]), - clamp_float_to_byte(dstFloats[1]), - clamp_float_to_byte(dstFloats[2])); + clamp_normalized_float_to_byte(dstFloats[0]), + clamp_normalized_float_to_byte(dstFloats[1]), + clamp_normalized_float_to_byte(dstFloats[2])); dst++; src++; diff --git a/src/core/SkColorSpaceXform.h b/src/core/SkColorSpaceXform.h index d54d1b812d..3472643519 100644 --- a/src/core/SkColorSpaceXform.h +++ b/src/core/SkColorSpaceXform.h @@ -34,17 +34,15 @@ public: virtual ~SkColorSpaceXform() {} }; -class SkGammaByValueXform : public SkColorSpaceXform { +class Sk2Dot2Xform : public SkColorSpaceXform { public: void xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const override; private: - SkGammaByValueXform(float srcGammas[3], const SkMatrix44& srcToDst, float dstGammas[3]); + Sk2Dot2Xform(const SkMatrix44& srcToDst); - float fSrcGammas[3]; - const SkMatrix44 fSrcToDst; - float fDstGammas[3]; + float fSrcToDst[16]; friend class SkColorSpaceXform; }; diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp index 6d3e6c9fea..8dec3fad0f 100644 --- a/src/core/SkOpts.cpp +++ b/src/core/SkOpts.cpp @@ -16,6 +16,7 @@ #include "SkBlitRow_opts.h" #include 
"SkBlurImageFilter_opts.h" #include "SkColorCubeFilter_opts.h" +#include "SkColorXform_opts.h" #include "SkMorphologyImageFilter_opts.h" #include "SkSwizzler_opts.h" #include "SkTextureCompressor_opts.h" @@ -76,6 +77,9 @@ namespace SkOpts { decltype(srcover_srgb_srgb) srcover_srgb_srgb = sk_default::srcover_srgb_srgb; + decltype(color_xform_2Dot2_RGBA_to_8888) color_xform_2Dot2_RGBA_to_8888 = + sk_default::color_xform_2Dot2_RGBA_to_8888; + // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp. void Init_ssse3(); void Init_sse41(); diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h index b8aea4aa34..0711471fb4 100644 --- a/src/core/SkOpts.h +++ b/src/core/SkOpts.h @@ -68,6 +68,11 @@ namespace SkOpts { // Blend ndst src pixels over dst, where both src and dst point to sRGB pixels (RGBA or BGRA). // If nsrc < ndst, we loop over src to create a pattern. extern void (*srcover_srgb_srgb)(uint32_t* dst, const uint32_t* src, int ndst, int nsrc); + + // Color xform RGBA input into SkPMColor ordered 8888 pixels. Does not premultiply, and + // assumes src and dst gamma curves are both 2.2f exponentials. + extern void (*color_xform_2Dot2_RGBA_to_8888)(uint32_t* dst, const uint32_t* src, int len, + const float srcToDstMatrix[16]); } #endif//SkOpts_DEFINED diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h new file mode 100644 index 0000000000..3fc620f014 --- /dev/null +++ b/src/opts/SkColorXform_opts.h @@ -0,0 +1,181 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. 
+ */ + +#ifndef SkColorXform_opts_DEFINED +#define SkColorXform_opts_DEFINED + +#include "SkColorPriv.h" + +namespace SK_OPTS_NS { + +static uint8_t clamp_float_to_byte(float v) { + if (v >= 254.5f) { + return 255; + } else if (v < 0.5f) { + return 0; + } else { + return (uint8_t) (v + 0.5f); + } +} + +static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_t* src, int len, + const float matrix[16]) { + while (len-- > 0) { + float srcFloats[3]; + srcFloats[0] = (float) ((*src >> 0) & 0xFF); + srcFloats[1] = (float) ((*src >> 8) & 0xFF); + srcFloats[2] = (float) ((*src >> 16) & 0xFF); + + // Convert to linear. + // TODO (msarett): + // We should use X^2.2 here instead of X^2. What is the impact on correctness? + // We should be able to get closer to 2.2 at a small performance cost. + srcFloats[0] = srcFloats[0] * srcFloats[0]; + srcFloats[1] = srcFloats[1] * srcFloats[1]; + srcFloats[2] = srcFloats[2] * srcFloats[2]; + + // Convert to dst gamut. + float dstFloats[3]; + // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero. + // Should we have another optimized path that avoids the extra addition when they + // are zero? + dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + + srcFloats[2] * matrix[8] + matrix[12]; + dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + + srcFloats[2] * matrix[9] + matrix[13]; + dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + + srcFloats[2] * matrix[10] + matrix[14]; + + // Convert to dst gamma. + // TODO (msarett): + // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness? + // We should be able to get closer to (1/2.2) at a small performance cost. 
+ dstFloats[0] = sqrtf(dstFloats[0]); + dstFloats[1] = sqrtf(dstFloats[1]); + dstFloats[2] = sqrtf(dstFloats[2]); + + *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), + clamp_float_to_byte(dstFloats[0]), + clamp_float_to_byte(dstFloats[1]), + clamp_float_to_byte(dstFloats[2])); + + dst++; + src++; + } +} + +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + +static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len, + const float matrix[16]) { + // Load transformation matrix. + __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); + __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); + __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); + __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]); + + while (len >= 4) { + // Load 4 pixels and convert them to floats. + __m128i rgba = _mm_loadu_si128((const __m128i*) src); + __m128i byteMask = _mm_set1_epi32(0xFF); + __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask)); + __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask)); + __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask)); + + // Convert to linear. + // FIXME (msarett): + // Should we be more accurate? + reds = _mm_mul_ps(reds, reds); + greens = _mm_mul_ps(greens, greens); + blues = _mm_mul_ps(blues, blues); + + // Apply the transformation matrix to dst gamut. + // FIXME (msarett): + // rQ, gQ, and bQ are almost always zero. Can we save a couple instructions? + + // Splat rX, rY, rZ, and rQ each across a register. 
+ __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); + __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); + __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); + __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); + + // dstReds = rX * reds + rY * greens + rZ * blues + rQ + __m128 dstReds = _mm_mul_ps(reds, rX); + dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); + dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); + dstReds = _mm_add_ps(dstReds, rQ); + + // Splat gX, gY, gZ, and gQ each across a register. + __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); + __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); + __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); + __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); + + // dstGreens = gX * reds + gY * greens + gZ * blues + gQ + __m128 dstGreens = _mm_mul_ps(reds, gX); + dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); + dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); + dstGreens = _mm_add_ps(dstGreens, gQ); + + // Splat bX, bY, bZ, and bQ each across a register. + __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); + __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); + __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); + __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); + + // dstBlues = bX * reds + bY * greens + bZ * blues + bQ + __m128 dstBlues = _mm_mul_ps(reds, bX); + dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); + dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); + dstBlues = _mm_add_ps(dstBlues, bQ); + + // Convert to dst gamma. + // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt. + // FIXME (msarett): + // Should we be more accurate? + dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds)); + dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens)); + dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues)); + + // Clamp floats to 0-255 range. 
+ dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_ps(255.0f))); + dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_ps(255.0f))); + dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_ps(255.0f))); + + // Convert to bytes and store to memory. + rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba); +#ifdef SK_PMCOLOR_IS_RGBA + rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); + rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)); + rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16)); +#else + rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) ); + rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)); + rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16)); +#endif + _mm_storeu_si128((__m128i*) dst, rgba); + + dst += 4; + src += 4; + len -= 4; + } + + color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); +} + +#else + +static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len, + const float matrix[16]) { + color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); +} + +#endif + +} + +#endif // SkColorXform_opts_DEFINED |