SkRasterPipeline: memcpy-free tail code.

We don't call the tail code nearly as often as the body code, but when we do, and it calls memcpy(), we first have to vzeroupper back into the non-AVX world. That does seem to slow things down considerably. You wouldn't think it, but replacing memcpy() with explicit element-by-element copies gives a nice speedup (tested on Windows).

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=3783
Change-Id: I40cbe1e529f2431825edec7638265601b64e7ec5
Reviewed-on: https://skia-review.googlesource.com/3783
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
author: Mike Klein <mtklein@chromium.org> 2016-10-20 16:20:46 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2016-10-20 21:33:01 +0000
commit: 050ffa9ad5d2bafc935c0a48ce3caed47446be12 (patch)
tree: 551f12d226aee77e2f95ef67968f9371e613ff21
parent: 958788ab461bf84e8c604a17dba5ebc2d6c27dfa (diff)
1 files changed, 53 insertions, 13 deletions
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 7e884c7cd3..f266433cd3 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -158,10 +158,18 @@ namespace SK_OPTS_NS {
     template <bool kIsTail, typename T>
     SI SkNx<N,T> load(size_t tail, const T* src) {
         SkASSERT(kIsTail == (tail > 0));
-        // TODO: better tail, maskload for 32- and 64-bit T
-        T buf[N] = {0};
+        // TODO: maskload for 32- and 64-bit T
+        T buf[8];
         if (kIsTail) {
-            memcpy(buf, src, tail*sizeof(T));
+            switch (tail & (N-1)) {
+                case 7: buf[6] = src[6];
+                case 6: buf[5] = src[5];
+                case 5: buf[4] = src[4];
+                case 4: buf[3] = src[3];
+                case 3: buf[2] = src[2];
+                case 2: buf[1] = src[1];
+            }
+            buf[0] = src[0];
             src = buf;
         }
         return SkNx<N,T>::Load(src);
@@ -170,12 +178,20 @@ namespace SK_OPTS_NS {
     template <bool kIsTail, typename T>
     SI void store(size_t tail, const SkNx<N,T>& v, T* dst) {
         SkASSERT(kIsTail == (tail > 0));
-        // TODO: better tail, maskstore for 32- and 64-bit T
-        T buf[N] = {0};
-        v.store(kIsTail ? buf : dst);
+        // TODO: maskstore for 32- and 64-bit T
         if (kIsTail) {
-            memcpy(dst, buf, tail*sizeof(T));
+            switch (tail & (N-1)) {
+                case 7: dst[6] = v[6];
+                case 6: dst[5] = v[5];
+                case 5: dst[4] = v[4];
+                case 4: dst[3] = v[3];
+                case 3: dst[2] = v[2];
+                case 2: dst[1] = v[1];
+            }
+            dst[0] = v[0];
+            return;
         }
+        v.store(dst);
     }
 
     SI void from_565(const SkNh& _565, SkNf* r, SkNf* g, SkNf* b) {
@@ -275,9 +291,17 @@ namespace SK_OPTS_NS {
     STAGE(load_d_f16, true) {
         auto ptr = (const uint64_t*)ctx + x;
 
-        uint64_t buf[N] = {0};
+        uint64_t buf[8];
         if (kIsTail) {
-            memcpy(buf, ptr, tail*sizeof(uint64_t));
+            switch (tail & (N-1)) {
+                case 7: buf[6] = ptr[6];
+                case 6: buf[5] = ptr[5];
+                case 5: buf[4] = ptr[4];
+                case 4: buf[3] = ptr[3];
+                case 3: buf[2] = ptr[2];
+                case 2: buf[1] = ptr[1];
+            }
+            buf[0] = ptr[0];
             ptr = buf;
         }
 
@@ -292,9 +316,17 @@ namespace SK_OPTS_NS {
     STAGE(load_s_f16, true) {
         auto ptr = (const uint64_t*)ctx + x;
 
-        uint64_t buf[N] = {0};
+        uint64_t buf[8];
         if (kIsTail) {
-            memcpy(buf, ptr, tail*sizeof(uint64_t));
+            switch (tail & (N-1)) {
+                case 7: buf[6] = ptr[6];
+                case 6: buf[5] = ptr[5];
+                case 5: buf[4] = ptr[4];
+                case 4: buf[3] = ptr[3];
+                case 3: buf[2] = ptr[2];
+                case 2: buf[1] = ptr[1];
+            }
+            buf[0] = ptr[0];
             ptr = buf;
         }
 
@@ -310,13 +342,21 @@ namespace SK_OPTS_NS {
         clamp_01_premul(r,g,b,a);
         auto ptr = (uint64_t*)ctx + x;
 
-        uint64_t buf[N] = {0};
+        uint64_t buf[8];
         SkNh::Store4(kIsTail ? buf : ptr, SkFloatToHalf_finite_ftz(r),
                                           SkFloatToHalf_finite_ftz(g),
                                           SkFloatToHalf_finite_ftz(b),
                                           SkFloatToHalf_finite_ftz(a));
         if (kIsTail) {
-            memcpy(ptr, buf, tail*sizeof(uint64_t));
+            switch (tail & (N-1)) {
+                case 7: ptr[6] = buf[6];
+                case 6: ptr[5] = buf[5];
+                case 5: ptr[4] = buf[4];
+                case 4: ptr[3] = buf[3];
+                case 3: ptr[2] = buf[2];
+                case 2: ptr[1] = buf[1];
+            }
+            ptr[0] = buf[0];
         }
     }
author	Mike Klein <mtklein@chromium.org>	2016-10-20 16:20:46 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2016-10-20 21:33:01 +0000
commit	050ffa9ad5d2bafc935c0a48ce3caed47446be12 (patch)
tree	551f12d226aee77e2f95ef67968f9371e613ff21
parent	958788ab461bf84e8c604a17dba5ebc2d6c27dfa (diff)