Port SkMatrix opts to SkOpts.

No changes to the code, just moved around. This will have the effect of enabling vectorized code on ARMv7. Should be no effect on ARMv8 or x86, which would have been vectorized already. nanobench --match mappoints changes on Nexus 5 (ARMv7): _affine: 132 -> 95, _scale: 118 -> 47, _trans: 60 -> 37. A teaser: We should next look at the ABCD->BADC shuffle we've noted that we need in _affine. A quick hack showed doing that optimally is another ~35% speedup on x86. Got to figure out how to do it best on ARM though: that same quick hack was a 2x slowdown there. Good reason to resurrect that SkNx_shuffle() CL! (I believe the answers are vrev64q_f32(v) and _mm_shuffle_ps(v,v, _MM_SHUFFLE(2,3,0,1)), but we should probably find out in another CL.) BUG=skia:4117 Review URL: https://codereview.chromium.org/1320673014
author: mtklein <mtklein@chromium.org> 2015-09-10 11:18:31 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-09-10 11:18:31 -0700
commit: 4e8a09d3672702704436112f3aa5611bd79b2690 (patch)
tree: 30194ee6e5ecb9f9e18526619c1da694af83ef74 /src/opts
parent: b3b9aec2215914c0e626d90db21b68b970526c2e (diff)
2 files changed, 111 insertions, 0 deletions
diff --git a/src/opts/SkMatrix_opts.h b/src/opts/SkMatrix_opts.h
new file mode 100644
index 0000000000..3fb2701e88
--- /dev/null
+++ b/src/opts/SkMatrix_opts.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkMatrix_opts_DEFINED
+#define SkMatrix_opts_DEFINED
+
+#include "SkMatrix.h"
+#include "SkNx.h"
+
+namespace SK_OPTS_NS {
+
+static void matrix_translate(const SkMatrix& m, SkPoint* dst, const SkPoint* src, int count) {  // dst[i] = src[i] + (tx, ty) for each of count points
+    SkASSERT(m.getType() <= SkMatrix::kTranslate_Mask);  // caller must pass a matrix whose type mask is at most translate
+    if (count > 0) {  // nothing is read or written when count <= 0
+        SkScalar tx = m.getTranslateX();
+        SkScalar ty = m.getTranslateY();
+        if (count & 1) {  // odd count: map one point with scalar math so an even number remains
+            dst->fX = src->fX + tx;
+            dst->fY = src->fY + ty;
+            src += 1;
+            dst += 1;
+        }
+        Sk4s trans4(tx, ty, tx, ty);  // (tx, ty) repeated, so one vector add offsets two points
+        count >>= 1;  // count is now the number of remaining point pairs
+        if (count & 1) {  // odd number of pairs: map a single pair here
+            (Sk4s::Load(&src->fX) + trans4).store(&dst->fX);  // covers two points; assumes SkPoint is packed as {fX, fY} - TODO confirm
+            src += 2;
+            dst += 2;
+        }
+        count >>= 1;  // count is now the number of remaining groups of four points
+        for (int i = 0; i < count; ++i) {  // each pass maps four points with two vector operations
+            (Sk4s::Load(&src[0].fX) + trans4).store(&dst[0].fX);  // points 0 and 1 of this group
+            (Sk4s::Load(&src[2].fX) + trans4).store(&dst[2].fX);  // points 2 and 3 of this group
+            src += 4;
+            dst += 4;
+        }
+    }
+}
+
+static void matrix_scale_translate(const SkMatrix& m, SkPoint* dst, const SkPoint* src, int count) {  // dst[i] = src[i] * (sx, sy) + (tx, ty) for each of count points
+    SkASSERT(m.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask));  // caller must pass a matrix whose type mask is at most scale | translate
+    if (count > 0) {  // nothing is read or written when count <= 0
+        SkScalar tx = m.getTranslateX();
+        SkScalar ty = m.getTranslateY();
+        SkScalar sx = m.getScaleX();
+        SkScalar sy = m.getScaleY();
+        if (count & 1) {  // odd count: map one point with scalar math so an even number remains
+            dst->fX = src->fX * sx + tx;
+            dst->fY = src->fY * sy + ty;
+            src += 1;
+            dst += 1;
+        }
+        Sk4s trans4(tx, ty, tx, ty);  // (tx, ty) repeated, so one vector add offsets two points
+        Sk4s scale4(sx, sy, sx, sy);  // (sx, sy) repeated, so one vector multiply scales two points
+        count >>= 1;  // count is now the number of remaining point pairs
+        if (count & 1) {  // odd number of pairs: map a single pair here
+            (Sk4s::Load(&src->fX) * scale4 + trans4).store(&dst->fX);  // covers two points; assumes SkPoint is packed as {fX, fY} - TODO confirm
+            src += 2;
+            dst += 2;
+        }
+        count >>= 1;  // count is now the number of remaining groups of four points
+        for (int i = 0; i < count; ++i) {  // each pass maps four points with two vector operations
+            (Sk4s::Load(&src[0].fX) * scale4 + trans4).store(&dst[0].fX);  // points 0 and 1 of this group
+            (Sk4s::Load(&src[2].fX) * scale4 + trans4).store(&dst[2].fX);  // points 2 and 3 of this group
+            src += 4;
+            dst += 4;
+        }
+    }
+}
+
+static void matrix_affine(const SkMatrix& m, SkPoint* dst, const SkPoint* src, int count) {  // dst[i] = (x*sx + y*kx + tx, x*ky + y*sy + ty) for each of count points
+    SkASSERT(m.getType() != SkMatrix::kPerspective_Mask);  // NOTE(review): compares the whole type mask, not just the perspective bit - confirm this is the intended check
+    if (count > 0) {  // nothing is read or written when count <= 0
+        SkScalar tx = m.getTranslateX();
+        SkScalar ty = m.getTranslateY();
+        SkScalar sx = m.getScaleX();
+        SkScalar sy = m.getScaleY();
+        SkScalar kx = m.getSkewX();
+        SkScalar ky = m.getSkewY();
+        if (count & 1) {  // odd count: map one point with scalar math so an even number remains
+            dst->set(src->fX * sx + src->fY * kx + tx,
+                     src->fX * ky + src->fY * sy + ty);
+            src += 1;
+            dst += 1;
+        }
+        Sk4s trans4(tx, ty, tx, ty);
+        Sk4s scale4(sx, sy, sx, sy);
+        Sk4s  skew4(kx, ky, kx, ky);    // applied to swizzle of src4
+        count >>= 1;  // count is now the number of remaining point pairs
+        for (int i = 0; i < count; ++i) {  // each pass maps two points
+            Sk4s src4 = Sk4s::Load(&src->fX);  // presumably (x0, y0, x1, y1); assumes SkPoint is packed as {fX, fY} - TODO confirm
+            Sk4s swz4(src[0].fY, src[0].fX, src[1].fY, src[1].fX);  // need ABCD -> BADC
+            (src4 * scale4 + swz4 * skew4 + trans4).store(&dst->fX);  // per lane: own coordinate * scale + other coordinate * skew + translate
+            src += 2;
+            dst += 2;
+        }
+    }
+}
+
+} // namespace SK_OPTS_NS
+
+#endif//SkMatrix_opts_DEFINED
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index 802d83f69c..a0388b0654 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -13,6 +13,7 @@
 #include "SkBlurImageFilter_opts.h"
 #include "SkColorCubeFilter_opts.h"
 #include "SkFloatingPoint_opts.h"
+#include "SkMatrix_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkTextureCompressor_opts.h"
 #include "SkUtils_opts.h"
@@ -42,5 +43,9 @@ namespace SkOpts {
         blit_row_color32 = sk_neon::blit_row_color32;
 
         color_cube_filter_span = sk_neon::color_cube_filter_span;
+
+        matrix_translate       = sk_neon::matrix_translate;
+        matrix_scale_translate = sk_neon::matrix_scale_translate;
+        matrix_affine          = sk_neon::matrix_affine;
     }
 }
author	mtklein <mtklein@chromium.org>	2015-09-10 11:18:31 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-09-10 11:18:31 -0700
commit	4e8a09d3672702704436112f3aa5611bd79b2690 (patch)
tree	30194ee6e5ecb9f9e18526619c1da694af83ef74 /src/opts
parent	b3b9aec2215914c0e626d90db21b68b970526c2e (diff)