aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts
diff options
context:
space:
mode:
authorGravatar mtklein <mtklein@chromium.org>2016-11-04 13:20:07 -0700
committerGravatar Commit bot <commit-bot@chromium.org>2016-11-04 13:20:07 -0700
commita4a4488a4c3f16758f7e2b050168fe8d2f3b2a4d (patch)
tree207759030d7ac86a60c6cfba1d4a95de15cec5c0 /src/opts
parentd8db392be9dd1887df04b10b5670991d6b098c17 (diff)
skrpb: evaluate color filters for constant shaders once.
The simplest thing to do here is just run the shader+color filter pipeline at construction time to create a new constant color shader (replacing the paint color).

This reduces a pipeline like:
  - constant_color (paint color)
  - matrix_4x5
  - clamp_a
  - load_d_foo, xfermode, lerp, store_foo
to
  - constant_color (paint color -> matrix_4x5 -> clamp_a)
  - load_d_foo, xfermode, lerp, store_foo

To implement all this, we add a new store_f32 stage that writes SkPM4f, and finally get around to implementing Sk8f::Store4() (store while reinterlacing). Sk4f::Store4() already exists for both SSE and NEON.

Next step: reduce simple constant_color -> store pipelines (src mode, full coverage) into non-pipeline memsets.

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2480823002
Review-Url: https://codereview.chromium.org/2480823002
Diffstat (limited to 'src/opts')
-rw-r--r--src/opts/SkNx_sse.h31
-rw-r--r--src/opts/SkRasterPipeline_opts.h24
2 files changed, 52 insertions, 3 deletions
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index a4594115e0..a4783c6302 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -544,6 +544,14 @@ public:
__m256i fVec;
};
+    // Wrappers around _mm256_unpack{lo,hi}_pd() for float vectors: the inputs are
+    // reinterpreted as __m256d, unpacked as 64-bit elements, and the result is
+    // reinterpreted back to __m256.
+    AI static __m256 unpacklo_pd(__m256 x, __m256 y) {
+        const __m256d xd = _mm256_castps_pd(x),
+                      yd = _mm256_castps_pd(y);
+        return _mm256_castpd_ps(_mm256_unpacklo_pd(xd, yd));
+    }
+    AI static __m256 unpackhi_pd(__m256 x, __m256 y) {
+        const __m256d xd = _mm256_castps_pd(x),
+                      yd = _mm256_castps_pd(y);
+        return _mm256_castpd_ps(_mm256_unpackhi_pd(xd, yd));
+    }
+
template <>
class SkNx<8, float> {
public:
@@ -560,6 +568,29 @@ public:
// Load 8 floats starting at ptr; uses the unaligned-load intrinsic, so ptr need not be 32-byte aligned.
AI static SkNx Load(const void* ptr) { return _mm256_loadu_ps((const float*)ptr); }
// Write all 8 lanes of fVec to ptr using the unaligned-store intrinsic.
AI void store(void* ptr) const { _mm256_storeu_ps((float*)ptr, fVec); }
+    // Interleave four planar vectors (8 reds, 8 greens, 8 blues, 8 alphas) into
+    // 8 consecutive rgba pixels and write them to ptr as 32 floats.
+    // Unaligned stores are used, so ptr has no alignment requirement.
+    AI static void Store4(void* ptr,
+                          const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
+        // Step 1: interleave 32-bit lanes within each 128-bit half.
+        __m256 rg0145 = _mm256_unpacklo_ps(r.fVec, g.fVec),  // r0 g0 r1 g1 | r4 g4 r5 g5
+               rg2367 = _mm256_unpackhi_ps(r.fVec, g.fVec),  // r2 ...      | r6 ...
+               ba0145 = _mm256_unpacklo_ps(b.fVec, a.fVec),  // b0 a0 b1 a1 | b4 a4 b5 a5
+               ba2367 = _mm256_unpackhi_ps(b.fVec, a.fVec);  // b2 ...      | b6 ...
+
+        // Step 2: interleave 64-bit pairs, giving one whole pixel per 128-bit half.
+        __m256 _04 = unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
+               _15 = unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
+               _26 = unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
+               _37 = unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...
+
+        // Step 3: regroup 128-bit halves so each register holds two adjacent pixels.
+        // _mm256_permute2f128_ps() reads its immediate as two 4-bit selectors:
+        // imm8[3:0] picks the low half of the result and imm8[7:4] the high half,
+        // where 0/1 mean lo/hi of the first argument and 2/3 mean lo/hi of the second
+        // (bit 3 of each selector zeroes that half instead).
+        __m256 _01 = _mm256_permute2f128_ps(_04, _15, 0x20),  // 0010 0000 == lo, lo
+               _23 = _mm256_permute2f128_ps(_26, _37, 0x20),
+               _45 = _mm256_permute2f128_ps(_04, _15, 0x31),  // 0011 0001 == hi, hi
+               _67 = _mm256_permute2f128_ps(_26, _37, 0x31);
+
+        _mm256_storeu_ps((float*)ptr + 0*8, _01);
+        _mm256_storeu_ps((float*)ptr + 1*8, _23);
+        _mm256_storeu_ps((float*)ptr + 2*8, _45);
+        _mm256_storeu_ps((float*)ptr + 3*8, _67);
+    }
+
+
// Lane-wise add, subtract and multiply across all 8 floats.
AI SkNx operator+(const SkNx& o) const { return _mm256_add_ps(fVec, o.fVec); }
AI SkNx operator-(const SkNx& o) const { return _mm256_sub_ps(fVec, o.fVec); }
AI SkNx operator*(const SkNx& o) const { return _mm256_mul_ps(fVec, o.fVec); }
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index bd42632b4b..155558e776 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -23,9 +23,9 @@ namespace {
static constexpr int N = 4;
#endif
-using SkNf = SkNx<N, float>;
-using SkNi = SkNx<N, int>;
-using SkNh = SkNx<N, uint16_t>;
+ using SkNf = SkNx<N, float>;
+ using SkNi = SkNx<N, int>;
+ using SkNh = SkNx<N, uint16_t>;
struct BodyStage;
struct TailStage;
@@ -379,6 +379,24 @@ STAGE(store_f16, false) {
}
}
+// Write the current r,g,b,a as SkPM4f pixels. ctx points at the destination
+// SkPM4f pointer, which is offset by x.
+STAGE(store_f32, false) {
+    auto dst = *(SkPM4f**)ctx + x;
+
+    SkPM4f scratch[8];
+    if (!kIsTail) {
+        // Full stride: interleave straight into the destination.
+        SkNf::Store4(dst, r,g,b,a);
+    } else {
+        // Tail: interleave into scratch, then copy out only the live pixels so we
+        // never write past them. At least one pixel is always written.
+        SkNf::Store4(scratch, r,g,b,a);
+        auto live = tail & (N-1);
+        if (live == 0) {
+            live = 1;
+        }
+        while (live > 0) {
+            --live;
+            dst[live] = scratch[live];
+        }
+    }
+}
+
// Load 8-bit SkPMColor-order sRGB.
STAGE(load_d_srgb, true) {