aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar msarett <msarett@google.com>2016-06-08 14:43:53 -0700
committerGravatar Commit bot <commit-bot@chromium.org>2016-06-08 14:43:53 -0700
commita9e878c836994bce695274b4c28890290139dcdf (patch)
tree0bb9f71f6606514ae0d291c03f0893682f5bd42e
parentdd40483534f57df206020c174cb56e2b2c258015 (diff)
Optimize color xforms with 2.2 gammas for SSE2
Because we recognize commonly used gamma tables and parameters as 2.2f, about 98% of jpegs with color profiles will pass through this xform (assuming the dst is also 2.2f). Sample size is 10,322 jpegs. I won't go crazy with performance numbers because this is a work in progress, particularly in terms of correctness. 201295.jpg on HP z620 (300x280, most common form of sRGB profile) Decode Time + QCMS Xform 1.28 ms QCMS Xform Only 0.495 ms Decode Time + Skia Opt Xform 1.01 ms Skia Opt Xform Only 0.235 ms Decode Time + Xform Speed-up 1.27x Xform Only Speed-up 2.11x FWIW, Skia xform time before these optimizations was 41.1 ms. But we expected that code to be slow. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2046013002 CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Review-Url: https://codereview.chromium.org/2046013002
-rw-r--r--bench/ColorCodecBench.cpp4
-rw-r--r--src/core/SkColorSpaceXform.cpp112
-rw-r--r--src/core/SkColorSpaceXform.h8
-rw-r--r--src/core/SkOpts.cpp4
-rw-r--r--src/core/SkOpts.h5
-rw-r--r--src/opts/SkColorXform_opts.h181
6 files changed, 240 insertions, 74 deletions
diff --git a/bench/ColorCodecBench.cpp b/bench/ColorCodecBench.cpp
index 85785d864c..bfbea62c41 100644
--- a/bench/ColorCodecBench.cpp
+++ b/bench/ColorCodecBench.cpp
@@ -172,6 +172,10 @@ void ColorCodecBench::onDelayedSetup() {
if (FLAGS_qcms) {
fDstSpaceQCMS.reset(qcms_profile_from_memory(dstData->data(), dstData->size()));
SkASSERT(fDstSpaceQCMS);
+
+ // This call takes a non-trivial amount of time, but I think it's the most fair to
+ // treat it as overhead. It only needs to happen once.
+ qcms_profile_precache_output_transform(fDstSpaceQCMS);
} else
#endif
{
diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp
index 216e9931ba..0faff88286 100644
--- a/src/core/SkColorSpaceXform.cpp
+++ b/src/core/SkColorSpaceXform.cpp
@@ -8,6 +8,7 @@
#include "SkColorPriv.h"
#include "SkColorSpace_Base.h"
#include "SkColorSpaceXform.h"
+#include "SkOpts.h"
static inline bool compute_gamut_xform(SkMatrix44* srcToDst, const SkMatrix44& srcToXYZ,
const SkMatrix44& dstToXYZ) {
@@ -36,18 +37,10 @@ std::unique_ptr<SkColorSpaceXform> SkColorSpaceXform::New(const sk_sp<SkColorSpa
return nullptr;
}
- if (as_CSB(srcSpace)->gammas()->isValues() && as_CSB(dstSpace)->gammas()->isValues()) {
- float srcGammas[3];
- float dstGammas[3];
- srcGammas[0] = as_CSB(srcSpace)->gammas()->fRed.fValue;
- srcGammas[1] = as_CSB(srcSpace)->gammas()->fGreen.fValue;
- srcGammas[2] = as_CSB(srcSpace)->gammas()->fBlue.fValue;
- dstGammas[0] = 1.0f / as_CSB(dstSpace)->gammas()->fRed.fValue;
- dstGammas[1] = 1.0f / as_CSB(dstSpace)->gammas()->fGreen.fValue;
- dstGammas[2] = 1.0f / as_CSB(dstSpace)->gammas()->fBlue.fValue;
-
- return std::unique_ptr<SkColorSpaceXform>(
- new SkGammaByValueXform(srcGammas, srcToDst, dstGammas));
+ if (SkColorSpace::k2Dot2Curve_GammaNamed == srcSpace->gammaNamed() &&
+ SkColorSpace::k2Dot2Curve_GammaNamed == dstSpace->gammaNamed())
+ {
+ return std::unique_ptr<SkColorSpaceXform>(new Sk2Dot2Xform(srcToDst));
}
return std::unique_ptr<SkColorSpaceXform>(
@@ -56,72 +49,53 @@ std::unique_ptr<SkColorSpaceXform> SkColorSpaceXform::New(const sk_sp<SkColorSpa
///////////////////////////////////////////////////////////////////////////////////////////////////
+Sk2Dot2Xform::Sk2Dot2Xform(const SkMatrix44& srcToDst)
+{
+    // Flatten srcToDst into fSrcToDst, four floats per source row.  Only the first
+    // three entries of each row are copied; the fourth slot of every row is zero:
+    //     rX gX bX 0
+    //     rY gY bY 0
+    //     rZ gZ bZ 0
+    //     rQ gQ bQ 0
+    for (int row = 0; row < 4; row++) {
+        float* out = &fSrcToDst[4 * row];
+        for (int col = 0; col < 3; col++) {
+            out[col] = srcToDst.getFloat(row, col);
+        }
+        out[3] = 0.0f;
+    }
+}
+
+void Sk2Dot2Xform::xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const {
+    SkOpts::color_xform_2Dot2_RGBA_to_8888(dst, src, len, fSrcToDst);  // callee's len is int
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
static inline float byte_to_float(uint8_t v) {
return ((float) v) * (1.0f / 255.0f);
}
-static inline uint8_t clamp_float_to_byte(float v) {
+// Expand range from 0-1 to 0-255, then convert.
+static inline uint8_t clamp_normalized_float_to_byte(float v) {
v = v * 255.0f;
- if (v > 255.0f) {
+ if (v >= 254.5f) {
return 255;
- } else if (v <= 0.0f) {
+ } else if (v < 0.5f) {
return 0;
} else {
return (uint8_t) (v + 0.5f);
}
}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-SkGammaByValueXform::SkGammaByValueXform(float srcGammas[3], const SkMatrix44& srcToDst,
- float dstGammas[3])
- : fSrcToDst(srcToDst)
-{
- memcpy(fSrcGammas, srcGammas, 3 * sizeof(float));
- memcpy(fDstGammas, dstGammas, 3 * sizeof(float));
-}
-
-void SkGammaByValueXform::xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const {
- while (len-- > 0) {
- float srcFloats[3];
- srcFloats[0] = byte_to_float((*src >> 0) & 0xFF);
- srcFloats[1] = byte_to_float((*src >> 8) & 0xFF);
- srcFloats[2] = byte_to_float((*src >> 16) & 0xFF);
-
- // Convert to linear.
- srcFloats[0] = pow(srcFloats[0], fSrcGammas[0]);
- srcFloats[1] = pow(srcFloats[1], fSrcGammas[1]);
- srcFloats[2] = pow(srcFloats[2], fSrcGammas[2]);
-
- // Convert to dst gamut.
- float dstFloats[3];
- dstFloats[0] = srcFloats[0] * fSrcToDst.getFloat(0, 0) +
- srcFloats[1] * fSrcToDst.getFloat(1, 0) +
- srcFloats[2] * fSrcToDst.getFloat(2, 0) + fSrcToDst.getFloat(3, 0);
- dstFloats[1] = srcFloats[0] * fSrcToDst.getFloat(0, 1) +
- srcFloats[1] * fSrcToDst.getFloat(1, 1) +
- srcFloats[2] * fSrcToDst.getFloat(2, 1) + fSrcToDst.getFloat(3, 1);
- dstFloats[2] = srcFloats[0] * fSrcToDst.getFloat(0, 2) +
- srcFloats[1] * fSrcToDst.getFloat(1, 2) +
- srcFloats[2] * fSrcToDst.getFloat(2, 2) + fSrcToDst.getFloat(3, 2);
-
- // Convert to dst gamma.
- dstFloats[0] = pow(dstFloats[0], fDstGammas[0]);
- dstFloats[1] = pow(dstFloats[1], fDstGammas[1]);
- dstFloats[2] = pow(dstFloats[2], fDstGammas[2]);
-
- *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
- clamp_float_to_byte(dstFloats[0]),
- clamp_float_to_byte(dstFloats[1]),
- clamp_float_to_byte(dstFloats[2]));
-
- dst++;
- src++;
- }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
// Interpolating lookup in a variably sized table.
static inline float interp_lut(uint8_t byte, float* table, size_t tableSize) {
float index = byte_to_float(byte) * (tableSize - 1);
@@ -261,9 +235,9 @@ void SkDefaultXform::xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_
}
*dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
- clamp_float_to_byte(dstFloats[0]),
- clamp_float_to_byte(dstFloats[1]),
- clamp_float_to_byte(dstFloats[2]));
+ clamp_normalized_float_to_byte(dstFloats[0]),
+ clamp_normalized_float_to_byte(dstFloats[1]),
+ clamp_normalized_float_to_byte(dstFloats[2]));
dst++;
src++;
diff --git a/src/core/SkColorSpaceXform.h b/src/core/SkColorSpaceXform.h
index d54d1b812d..3472643519 100644
--- a/src/core/SkColorSpaceXform.h
+++ b/src/core/SkColorSpaceXform.h
@@ -34,17 +34,15 @@ public:
virtual ~SkColorSpaceXform() {}
};
-class SkGammaByValueXform : public SkColorSpaceXform {
+class Sk2Dot2Xform : public SkColorSpaceXform {
public:
void xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const override;
private:
- SkGammaByValueXform(float srcGammas[3], const SkMatrix44& srcToDst, float dstGammas[3]);
+ Sk2Dot2Xform(const SkMatrix44& srcToDst);
- float fSrcGammas[3];
- const SkMatrix44 fSrcToDst;
- float fDstGammas[3];
+ float fSrcToDst[16];
friend class SkColorSpaceXform;
};
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 6d3e6c9fea..8dec3fad0f 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -16,6 +16,7 @@
#include "SkBlitRow_opts.h"
#include "SkBlurImageFilter_opts.h"
#include "SkColorCubeFilter_opts.h"
+#include "SkColorXform_opts.h"
#include "SkMorphologyImageFilter_opts.h"
#include "SkSwizzler_opts.h"
#include "SkTextureCompressor_opts.h"
@@ -76,6 +77,9 @@ namespace SkOpts {
decltype(srcover_srgb_srgb) srcover_srgb_srgb = sk_default::srcover_srgb_srgb;
+ decltype(color_xform_2Dot2_RGBA_to_8888) color_xform_2Dot2_RGBA_to_8888 =
+ sk_default::color_xform_2Dot2_RGBA_to_8888;
+
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();
void Init_sse41();
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index b8aea4aa34..0711471fb4 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -68,6 +68,11 @@ namespace SkOpts {
// Blend ndst src pixels over dst, where both src and dst point to sRGB pixels (RGBA or BGRA).
// If nsrc < ndst, we loop over src to create a pattern.
extern void (*srcover_srgb_srgb)(uint32_t* dst, const uint32_t* src, int ndst, int nsrc);
+
+ // Color xform RGBA input into SkPMColor ordered 8888 pixels. Does not premultiply, and
+ // assumes src and dst gamma curves are both 2.2f exponentials.
+ extern void (*color_xform_2Dot2_RGBA_to_8888)(uint32_t* dst, const uint32_t* src, int len,
+ const float srcToDstMatrix[16]);
}
#endif//SkOpts_DEFINED
diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h
new file mode 100644
index 0000000000..3fc620f014
--- /dev/null
+++ b/src/opts/SkColorXform_opts.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkColorXform_opts_DEFINED
+#define SkColorXform_opts_DEFINED
+
+#include "SkColorPriv.h"
+
+namespace SK_OPTS_NS {
+
+// Rounds v to the nearest integer and clamps the result to [0, 255].
+// v is taken on the 0-255 scale (it is not multiplied by 255 here).
+// NaN maps to 0.  This matters because sqrtf() of a negative value is NaN, and
+// converting NaN to an integer type is undefined behavior.
+static uint8_t clamp_float_to_byte(float v) {
+    // Written as a negated >= so that NaN, which fails every ordered comparison,
+    // lands in this branch together with everything below 0.5.
+    if (!(v >= 0.5f)) {
+        return 0;
+    }
+    if (v >= 254.5f) {
+        return 255;
+    }
+    return (uint8_t) (v + 0.5f);
+}
+
+// Scalar color xform for the case where src and dst gammas are both treated as 2.2.
+//
+// dst/src: len 8888 pixels.  Red is read from bits 0-7, green from bits 8-15 and blue
+//          from bits 16-23 of each src pixel; bits 24-31 (alpha) are passed through.
+// matrix:  16 floats.  For dst channel c (0..2) the linear result is
+//          r * matrix[c] + g * matrix[4 + c] + b * matrix[8 + c] + matrix[12 + c].
+//
+// Channel values stay on the 0-255 scale throughout; the output is packed with
+// SkPackARGB32NoCheck and is not premultiplied here.
+static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_t* src, int len,
+                                                    const float matrix[16]) {
+    for (; len > 0; len--, src++, dst++) {
+        const uint32_t pixel = *src;
+
+        // Convert to linear.
+        // TODO (msarett):
+        // We should use X^2.2 here instead of X^2.  What is the impact on correctness?
+        // We should be able to get closer to 2.2 at a small performance cost.
+        float linear[3];
+        for (int c = 0; c < 3; c++) {
+            const float v = (float) ((pixel >> (8 * c)) & 0xFF);
+            linear[c] = v * v;
+        }
+
+        uint8_t out[3];
+        for (int c = 0; c < 3; c++) {
+            // Convert to dst gamut.
+            // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero.
+            // Should we have another optimized path that avoids the extra addition when they
+            // are zero?
+            float v = linear[0] * matrix[c] + linear[1] * matrix[4 + c] +
+                      linear[2] * matrix[8 + c] + matrix[12 + c];
+
+            // The matrix can produce negative values.  sqrtf() of a negative value is
+            // NaN, so pin anything that is not strictly positive (including NaN) to zero
+            // before taking the root.
+            if (!(v > 0.0f)) {
+                v = 0.0f;
+            }
+
+            // Convert to dst gamma.
+            // TODO (msarett):
+            // We should use X^(1/2.2) here instead of X^(1/2).  What is the impact on
+            // correctness?  We should be able to get closer to (1/2.2) at a small
+            // performance cost.
+            out[c] = clamp_float_to_byte(sqrtf(v));
+        }
+
+        *dst = SkPackARGB32NoCheck(((pixel >> 24) & 0xFF), out[0], out[1], out[2]);
+    }
+}
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+
+// SSE2 color xform for the case where src and dst gammas are both treated as 2.2.
+// Processes four pixels per iteration and hands the remaining (fewer than four)
+// pixels to color_xform_2Dot2_RGBA_to_8888_portable.
+//
+// dst/src: len 8888 pixels.  Red is read from bits 0-7, green from bits 8-15 and blue
+//          from bits 16-23 of each src pixel; bits 24-31 (alpha) are passed through.
+// matrix:  16 floats.  For dst channel c (0..2) the linear result is
+//          r * matrix[c] + g * matrix[4 + c] + b * matrix[8 + c] + matrix[12 + c].
+static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,
+                                           const float matrix[16]) {
+    // Load transformation matrix.
+    const __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);
+    const __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);
+    const __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);
+    const __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);
+
+    // Splat every coefficient across its own register.  None of these depend on the
+    // pixel data, so they are computed once, outside the loop.
+    // FIXME (msarett):
+    // rQ, gQ, and bQ are almost always zero.  Can we save a couple instructions?
+    const __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);
+    const __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);
+    const __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);
+    const __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00);
+
+    const __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);
+    const __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);
+    const __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);
+    const __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55);
+
+    const __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
+    const __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
+    const __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
+    const __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA);
+
+    const __m128i byteMask  = _mm_set1_epi32(0xFF);
+    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
+    const __m128  zero      = _mm_setzero_ps();
+    const __m128  maxByte   = _mm_set1_ps(255.0f);
+
+    while (len >= 4) {
+        // Load 4 pixels and convert them to floats.
+        const __m128i rgba = _mm_loadu_si128((const __m128i*) src);
+        __m128 reds   = _mm_cvtepi32_ps(_mm_and_si128(               rgba,      byteMask));
+        __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba,  8), byteMask));
+        __m128 blues  = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask));
+
+        // Convert to linear.
+        // FIXME (msarett):
+        // Should we be more accurate?
+        reds   = _mm_mul_ps(reds,   reds);
+        greens = _mm_mul_ps(greens, greens);
+        blues  = _mm_mul_ps(blues,  blues);
+
+        // Apply the transformation matrix to dst gamut.
+        // dstReds = rX * reds + rY * greens + rZ * blues + rQ
+        __m128 dstReds = _mm_mul_ps(reds, rX);
+        dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));
+        dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues,  rZ));
+        dstReds = _mm_add_ps(dstReds, rQ);
+
+        // dstGreens = gX * reds + gY * greens + gZ * blues + gQ
+        __m128 dstGreens = _mm_mul_ps(reds, gX);
+        dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));
+        dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues,  gZ));
+        dstGreens = _mm_add_ps(dstGreens, gQ);
+
+        // dstBlues = bX * reds + bY * greens + bZ * blues + bQ
+        __m128 dstBlues = _mm_mul_ps(reds, bX);
+        dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
+        dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues,  bZ));
+        dstBlues = _mm_add_ps(dstBlues, bQ);
+
+        // Clamp negative results to zero BEFORE the square root.  _mm_rsqrt_ps yields
+        // NaN for a negative lane, and _mm_min_ps returns its second operand when the
+        // first is NaN, so clamping only afterwards would turn negative values into 255
+        // instead of 0.
+        dstReds   = _mm_max_ps(dstReds,   zero);
+        dstGreens = _mm_max_ps(dstGreens, zero);
+        dstBlues  = _mm_max_ps(dstBlues,  zero);
+
+        // Convert to dst gamma.
+        // Note that the reciprocal of the reciprocal sqrt is just a fast sqrt.
+        // FIXME (msarett):
+        // Should we be more accurate?
+        dstReds   = _mm_rcp_ps(_mm_rsqrt_ps(dstReds));
+        dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens));
+        dstBlues  = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues));
+
+        // Clamp to the top of the byte range.  The inputs to the sqrt approximation
+        // were non-negative, so no lower clamp is needed here.
+        dstReds   = _mm_min_ps(dstReds,   maxByte);
+        dstGreens = _mm_min_ps(dstGreens, maxByte);
+        dstBlues  = _mm_min_ps(dstBlues,  maxByte);
+
+        // Convert to bytes and store to memory, keeping the source alpha byte.
+        __m128i out = _mm_and_si128(alphaMask, rgba);
+#ifdef SK_PMCOLOR_IS_RGBA
+        out = _mm_or_si128(out,                _mm_cvtps_epi32(dstReds)       );
+        out = _mm_or_si128(out, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens),  8));
+        out = _mm_or_si128(out, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues),  16));
+#else
+        out = _mm_or_si128(out,                _mm_cvtps_epi32(dstBlues)      );
+        out = _mm_or_si128(out, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens),  8));
+        out = _mm_or_si128(out, _mm_slli_epi32(_mm_cvtps_epi32(dstReds),   16));
+#endif
+        _mm_storeu_si128((__m128i*) dst, out);
+
+        dst += 4;
+        src += 4;
+        len -= 4;
+    }
+
+    // Fewer than four pixels remain; finish them with the scalar code.
+    color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);
+}
+
+#else
+
+static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,
+                                           const float matrix[16]) {  // non-SSE2 build
+    color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);  // scalar path only
+}
+
+#endif
+
+}
+
+#endif // SkColorXform_opts_DEFINED