about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: msarett <msarett@google.com> 2016-06-22 14:55:51 -0700
committer: Commit bot <commit-bot@chromium.org> 2016-06-22 14:55:51 -0700
commit: 9bba21530d7566494533788f4934848ea9318080 (patch)
tree: 08402cfe388091611acebde0da70859ada4202f4
parent: b39067696ad08a26bbe49b71a71f0546dc42a075 (diff)
Do loads and math in parallel in SkColorXform_opts
Note that baselines have changed a little since I recently started using clang.

201295.jpg on HP z620 (300x280)
Skia Xform sRGB Dst Before 0.378 ms
Skia Xform sRGB Dst After 0.322 ms 1.17x
Skia Xform 2.2 Dst Before 0.428 ms
Skia Xform 2.2 Dst After 0.395 ms 1.08x
QCMS Xform 0.418 ms
sRGB Dst vs QCMS 1.30x
2.2 Dst vs QCMS 1.06x
--------------------------------------------
Nexus 6P:
Skia Xform sRGB Dst Before 1.58 ms
Skia Xform sRGB Dst After 1.43 ms
Skia Xform 2.2 Dst Before 2.69 ms
Skia Xform 2.2 Dst After 2.62 ms

Dell Venue 8:
Skia Xform sRGB Dst Before 2.78 ms
Skia Xform sRGB Dst After 2.74 ms
Skia Xform 2.2 Dst Before 3.73 ms
Skia Xform 2.2 Dst After 3.64 ms

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2081933005
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review-Url: https://codereview.chromium.org/2081933005
-rw-r--r-- src/opts/SkColorXform_opts.h 97
1 files changed, 55 insertions, 42 deletions
diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h
index b179311e36..74aa53ce7a 100644
--- a/src/opts/SkColorXform_opts.h
+++ b/src/opts/SkColorXform_opts.h
@@ -55,51 +55,64 @@ static Sk4f clamp_0_to_255(const Sk4f& x) {
template <const float (&linear_from_curve)[256], Sk4f (*linear_to_curve)(const Sk4f&)>
static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
const float matrix[16]) {
- // Load transformation matrix.
- auto rXgXbX = Sk4f::Load(matrix + 0),
+ Sk4f rXgXbX = Sk4f::Load(matrix + 0),
rYgYbY = Sk4f::Load(matrix + 4),
rZgZbZ = Sk4f::Load(matrix + 8);
- while (len >= 4) {
- // Convert to linear. The look-up table has perfect accuracy.
- auto reds = Sk4f{linear_from_curve[(src[0] >> 0) & 0xFF],
- linear_from_curve[(src[1] >> 0) & 0xFF],
- linear_from_curve[(src[2] >> 0) & 0xFF],
- linear_from_curve[(src[3] >> 0) & 0xFF]};
- auto greens = Sk4f{linear_from_curve[(src[0] >> 8) & 0xFF],
- linear_from_curve[(src[1] >> 8) & 0xFF],
- linear_from_curve[(src[2] >> 8) & 0xFF],
- linear_from_curve[(src[3] >> 8) & 0xFF]};
- auto blues = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF],
- linear_from_curve[(src[1] >> 16) & 0xFF],
- linear_from_curve[(src[2] >> 16) & 0xFF],
- linear_from_curve[(src[3] >> 16) & 0xFF]};
-
- // Apply the transformation matrix to dst gamut.
- auto dstReds = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues,
- dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues,
- dstBlues = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues;
-
- // Convert to dst gamma.
- dstReds = linear_to_curve(dstReds);
- dstGreens = linear_to_curve(dstGreens);
- dstBlues = linear_to_curve(dstBlues);
-
- // Clamp floats to byte range.
- dstReds = clamp_0_to_255(dstReds);
- dstGreens = clamp_0_to_255(dstGreens);
- dstBlues = clamp_0_to_255(dstBlues);
-
- // Convert to bytes and store to memory.
- auto rgba = (Sk4i{(int)0xFF000000} )
- | (SkNx_cast<int>(dstReds) )
- | (SkNx_cast<int>(dstGreens) << 8)
- | (SkNx_cast<int>(dstBlues) << 16);
- rgba.store(dst);
-
- dst += 4;
- src += 4;
- len -= 4;
+ if (len >= 4) {
+ Sk4f reds, greens, blues;
+ auto load_next_4 = [&reds, &greens, &blues, &src, &len] {
+ reds = Sk4f{linear_from_curve[(src[0] >> 0) & 0xFF],
+ linear_from_curve[(src[1] >> 0) & 0xFF],
+ linear_from_curve[(src[2] >> 0) & 0xFF],
+ linear_from_curve[(src[3] >> 0) & 0xFF]};
+ greens = Sk4f{linear_from_curve[(src[0] >> 8) & 0xFF],
+ linear_from_curve[(src[1] >> 8) & 0xFF],
+ linear_from_curve[(src[2] >> 8) & 0xFF],
+ linear_from_curve[(src[3] >> 8) & 0xFF]};
+ blues = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF],
+ linear_from_curve[(src[1] >> 16) & 0xFF],
+ linear_from_curve[(src[2] >> 16) & 0xFF],
+ linear_from_curve[(src[3] >> 16) & 0xFF]};
+ src += 4;
+ len -= 4;
+ };
+
+ Sk4f dstReds, dstGreens, dstBlues;
+ auto transform_4 = [&reds, &greens, &blues, &dstReds, &dstGreens, &dstBlues, &rXgXbX,
+ &rYgYbY, &rZgZbZ] {
+ dstReds = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues;
+ dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues;
+ dstBlues = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues;
+ };
+
+ auto store_4 = [&dstReds, &dstGreens, &dstBlues, &dst] {
+ dstReds = linear_to_curve(dstReds);
+ dstGreens = linear_to_curve(dstGreens);
+ dstBlues = linear_to_curve(dstBlues);
+
+ dstReds = clamp_0_to_255(dstReds);
+ dstGreens = clamp_0_to_255(dstGreens);
+ dstBlues = clamp_0_to_255(dstBlues);
+
+ auto rgba = (Sk4i{(int)0xFF000000} )
+ | (SkNx_cast<int>(dstReds) )
+ | (SkNx_cast<int>(dstGreens) << 8)
+ | (SkNx_cast<int>(dstBlues) << 16);
+ rgba.store(dst);
+ dst += 4;
+ };
+
+ load_next_4();
+
+ while (len >= 4) {
+ transform_4();
+ load_next_4();
+ store_4();
+ }
+
+ transform_4();
+ store_4();
}
while (len > 0) {