author    | Mike Klein <mtklein@chromium.org> | 2017-08-29 08:40:48 -0400
committer | Mike Klein <mtklein@chromium.org> | 2017-08-29 17:04:47 +0000
commit    | 9d7e57d509149dd2fcb3ba73ea8f4cdce11f84bd (patch)
tree      | 5442beb60c037b62ebc9477742d6490fb6dcac20 /src/jumper
parent    | 6d13575108299951ecdfba6d85c915fcec2bc028 (diff)
Revert "Revert "8-bit jumper on armv8""
This reverts commit 6d13575108299951ecdfba6d85c915fcec2bc028.
Now with guards for "errors" like this:
    external/skia/src/jumper/SkJumper_stages_8bit.cpp:240:50: error: 'memcpy' called with size bigger than buffer
            case 12: memcpy(&v, src, 12*sizeof(T)); break;
This code is unreachable and generally removed by Clang's optimizer
anyway... as far as I can tell the code generation diff is arbitrary.
Change-Id: I6216567caaa6166f71258bd25343a09e93892a10
Reviewed-on: https://skia-review.googlesource.com/39961
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
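
The guard in question (visible in the SkJumper_stages_8bit.cpp hunk below) wraps the 8..15-element tail cases in #if defined(__AVX2__), so builds whose vectors are only 8 lanes wide never see a memcpy larger than the destination. A minimal sketch of the pattern follows; load_tail, V, and T here are simplified stand-ins for illustration, not the shipped code:

    // Sketch only: the real code is templated over Skia's V and T and lives in
    // SkJumper_stages_8bit.cpp; this stand-alone version just shows the guard.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    #if defined(__AVX2__)
        using V = uint8_t __attribute__((ext_vector_type(16)));  // 16 lanes per pass
    #else
        using V = uint8_t __attribute__((ext_vector_type(8)));   // 8 lanes per pass
    #endif
    using T = uint8_t;

    static V load_tail(const T* src, size_t tail) {
        V v = 0;
        switch (tail) {
        #if defined(__AVX2__)   // Cases 8..15 exist only when V really has 16 lanes,
                                // so Clang never sees an over-sized memcpy here.
            case 15: v[14] = src[14];
            case 14: v[13] = src[13];
            case 13: v[12] = src[12];
            case 12: memcpy(&v, src, 12*sizeof(T)); break;  // the statement the error flagged
            case 11: v[10] = src[10];
            case 10: v[ 9] = src[ 9];
            case  9: v[ 8] = src[ 8];
            case  8: memcpy(&v, src,  8*sizeof(T)); break;
        #endif
            case  7: v[6] = src[6];
            case  6: v[5] = src[5];
            case  5: v[4] = src[4];
            case  4: memcpy(&v, src,  4*sizeof(T)); break;
            case  3: v[2] = src[2];
            case  2: memcpy(&v, src,  2*sizeof(T)); break;
            case  1: v[0] = src[0];
        }
        return v;
    }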
Diffstat (limited to 'src/jumper')
-rw-r--r-- | src/jumper/SkJumper.cpp             | 42
-rw-r--r-- | src/jumper/SkJumper_generated.S     | 44
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 66
-rw-r--r-- | src/jumper/SkJumper_stages_8bit.cpp | 41
4 files changed, 118 insertions, 75 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 9f8e970f32..315110faf2 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -110,7 +110,7 @@ using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**,K*);
 extern "C" {
 #if __has_feature(memory_sanitizer)
-    // We'll just run portable code.
+    // We'll just run baseline code.
 #elif defined(__arm__)
     StartPipelineFn ASM(start_pipeline,vfp4);
@@ -168,12 +168,22 @@ extern "C" {
 #endif
-    // Portable, single-pixel stages.
+    // Baseline code compiled as a normal part of Skia.
     StartPipelineFn sk_start_pipeline;
     StageFn sk_just_return;
     #define M(st) StageFn sk_##st;
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
+
+#if defined(__clang__) && defined(__aarch64__)
+    // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
+    StartPipelineFn sk_start_pipeline_8bit;
+    StageFn sk_just_return_8bit;
+    #define M(st) StageFn sk_##st##_8bit;
+        SK_RASTER_PIPELINE_STAGES(M)
+    #undef M
+#endif
+
 }
 #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
@@ -198,6 +208,16 @@ extern "C" {
     }
     LOWP_STAGES(M)
     #undef M
+#elif defined(__clang__) && defined(__aarch64__)
+    template <SkRasterPipeline::StockStage st>
+    static constexpr StageFn* aarch64_8bit() { return nullptr; }
+
+    #define M(st) \
+        template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() { \
+            return sk_##st##_8bit; \
+        }
+    LOWP_STAGES(M)
+    #undef M
 #endif
 // Engines comprise everything we need to run SkRasterPipelines.
@@ -207,20 +227,20 @@ struct SkJumper_Engine {
     StageFn* just_return;
 };
-// We'll default to this portable engine, but try to choose a better one at runtime.
-static const SkJumper_Engine kPortable = {
+// We'll default to this baseline engine, but try to choose a better one at runtime.
+static const SkJumper_Engine kBaseline = {
 #define M(stage) sk_##stage,
     { SK_RASTER_PIPELINE_STAGES(M) },
 #undef M
     sk_start_pipeline,
     sk_just_return,
 };
-static SkJumper_Engine gEngine = kPortable;
+static SkJumper_Engine gEngine = kBaseline;
 static SkOnce gChooseEngineOnce;
 static SkJumper_Engine choose_engine() {
 #if __has_feature(memory_sanitizer)
-    // We'll just run portable code.
+    // We'll just run baseline code.
#elif defined(__arm__) if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) { @@ -283,7 +303,7 @@ static SkJumper_Engine choose_engine() { } #endif - return kPortable; + return kBaseline; } #ifndef SK_JUMPER_DISABLE_8BIT @@ -326,6 +346,14 @@ static SkJumper_Engine choose_engine() { #undef M }; } + #elif defined(__clang__) && defined(__aarch64__) + return { + #define M(st) aarch64_8bit<SkRasterPipeline::st>(), + { SK_RASTER_PIPELINE_STAGES(M) }, + sk_start_pipeline_8bit, + sk_just_return_8bit, + #undef M + }; #endif return kNone; } diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index eeb3a88d77..465095b67f 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -50207,9 +50207,9 @@ _sk_load_a8_sse2_8bit: .byte 117,48 // jne 28f81 <_sk_load_a8_sse2_8bit+0x4d> .byte 243,66,15,126,4,2 // movq (%rdx,%r8,1),%xmm0 .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0 - .byte 102,15,84,5,109,51,0,0 // andpd 0x336d(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,5,109,51,0,0 // pand 0x336d(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,239,228 // pxor %xmm4,%xmm4 - .byte 102,15,40,200 // movapd %xmm0,%xmm1 + .byte 102,15,111,200 // movdqa %xmm0,%xmm1 .byte 102,15,105,204 // punpckhwd %xmm4,%xmm1 .byte 102,15,97,196 // punpcklwd %xmm4,%xmm0 .byte 102,15,114,240,24 // pslld $0x18,%xmm0 @@ -50284,9 +50284,9 @@ _sk_load_a8_dst_sse2_8bit: .byte 117,48 // jne 29075 <_sk_load_a8_dst_sse2_8bit+0x4d> .byte 243,66,15,126,20,2 // movq (%rdx,%r8,1),%xmm2 .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2 - .byte 102,15,84,21,121,50,0,0 // andpd 0x3279(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,21,121,50,0,0 // pand 0x3279(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,239,228 // pxor %xmm4,%xmm4 - .byte 102,15,40,218 // movapd %xmm2,%xmm3 + .byte 102,15,111,218 // movdqa %xmm2,%xmm3 .byte 102,15,105,220 // punpckhwd %xmm4,%xmm3 .byte 102,15,97,212 // punpcklwd %xmm4,%xmm2 .byte 102,15,114,242,24 // pslld $0x18,%xmm2 @@ -50382,26 +50382,26 @@ _sk_store_a8_sse2_8bit: .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax - .byte 102,15,127,100,36,168 // movdqa %xmm4,-0x58(%rsp) - .byte 138,68,36,168 // mov -0x58(%rsp),%al + .byte 102,15,127,100,36,232 // movdqa %xmm4,-0x18(%rsp) + .byte 138,68,36,232 // mov -0x18(%rsp),%al .byte 66,136,4,2 // mov %al,(%rdx,%r8,1) .byte 235,203 // jmp 29175 <_sk_store_a8_sse2_8bit+0x59> - .byte 102,15,127,100,36,184 // movdqa %xmm4,-0x48(%rsp) - .byte 138,68,36,188 // mov -0x44(%rsp),%al + .byte 102,15,127,100,36,216 // movdqa %xmm4,-0x28(%rsp) + .byte 138,68,36,220 // mov -0x24(%rsp),%al .byte 66,136,68,2,2 // mov %al,0x2(%rdx,%r8,1) .byte 102,15,219,37,15,49,0,0 // pand 0x310f(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,103,228 // packuswb %xmm4,%xmm4 .byte 102,15,126,224 // movd %xmm4,%eax .byte 102,66,137,4,2 // mov %ax,(%rdx,%r8,1) .byte 235,165 // jmp 29175 <_sk_store_a8_sse2_8bit+0x59> - .byte 102,15,127,100,36,232 // movdqa %xmm4,-0x18(%rsp) - .byte 138,68,36,244 // mov -0xc(%rsp),%al + .byte 102,15,127,100,36,200 // movdqa %xmm4,-0x38(%rsp) + .byte 138,68,36,212 // mov -0x2c(%rsp),%al .byte 66,136,68,2,6 // mov %al,0x6(%rdx,%r8,1) - .byte 102,15,127,100,36,216 // movdqa %xmm4,-0x28(%rsp) - .byte 138,68,36,226 // mov -0x1e(%rsp),%al + .byte 102,15,127,100,36,184 // movdqa %xmm4,-0x48(%rsp) + .byte 138,68,36,194 // mov -0x3e(%rsp),%al .byte 66,136,68,2,5 // mov 
%al,0x5(%rdx,%r8,1) - .byte 102,15,127,100,36,200 // movdqa %xmm4,-0x38(%rsp) - .byte 138,68,36,208 // mov -0x30(%rsp),%al + .byte 102,15,127,100,36,168 // movdqa %xmm4,-0x58(%rsp) + .byte 138,68,36,176 // mov -0x50(%rsp),%al .byte 66,136,68,2,4 // mov %al,0x4(%rdx,%r8,1) .byte 102,15,219,37,203,48,0,0 // pand 0x30cb(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,103,228 // packuswb %xmm4,%xmm4 @@ -50440,9 +50440,9 @@ _sk_load_g8_sse2_8bit: .byte 117,116 // jne 292c1 <_sk_load_g8_sse2_8bit+0x91> .byte 243,66,15,126,4,2 // movq (%rdx,%r8,1),%xmm0 .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0 - .byte 102,15,84,5,113,48,0,0 // andpd 0x3071(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,5,113,48,0,0 // pand 0x3071(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,239,201 // pxor %xmm1,%xmm1 - .byte 102,15,40,224 // movapd %xmm0,%xmm4 + .byte 102,15,111,224 // movdqa %xmm0,%xmm4 .byte 102,15,97,225 // punpcklwd %xmm1,%xmm4 .byte 102,15,105,193 // punpckhwd %xmm1,%xmm0 .byte 102,15,111,45,169,55,0,0 // movdqa 0x37a9(%rip),%xmm5 # 2ca20 <_sk_overlay_sse2_8bit+0x153b> @@ -50532,9 +50532,9 @@ _sk_load_g8_dst_sse2_8bit: .byte 117,116 // jne 29401 <_sk_load_g8_dst_sse2_8bit+0x91> .byte 243,66,15,126,20,2 // movq (%rdx,%r8,1),%xmm2 .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2 - .byte 102,15,84,21,49,47,0,0 // andpd 0x2f31(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,21,49,47,0,0 // pand 0x2f31(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,239,219 // pxor %xmm3,%xmm3 - .byte 102,15,40,226 // movapd %xmm2,%xmm4 + .byte 102,15,111,226 // movdqa %xmm2,%xmm4 .byte 102,15,97,227 // punpcklwd %xmm3,%xmm4 .byte 102,15,105,211 // punpckhwd %xmm3,%xmm2 .byte 102,15,111,45,105,54,0,0 // movdqa 0x3669(%rip),%xmm5 # 2ca20 <_sk_overlay_sse2_8bit+0x153b> @@ -50815,9 +50815,9 @@ _sk_scale_u8_sse2_8bit: .byte 15,133,239,0,0,0 // jne 298ad <_sk_scale_u8_sse2_8bit+0x110> .byte 243,66,15,126,36,2 // movq (%rdx,%r8,1),%xmm4 .byte 102,15,96,224 // punpcklbw %xmm0,%xmm4 - .byte 102,15,84,37,0,43,0,0 // andpd 0x2b00(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,37,0,43,0,0 // pand 0x2b00(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,69,15,239,192 // pxor %xmm8,%xmm8 - .byte 102,15,40,236 // movapd %xmm4,%xmm5 + .byte 102,15,111,236 // movdqa %xmm4,%xmm5 .byte 102,65,15,105,232 // punpckhwd %xmm8,%xmm5 .byte 102,65,15,97,224 // punpcklwd %xmm8,%xmm4 .byte 102,15,114,244,24 // pslld $0x18,%xmm4 @@ -51005,9 +51005,9 @@ _sk_lerp_u8_sse2_8bit: .byte 15,133,141,1,0,0 // jne 29c44 <_sk_lerp_u8_sse2_8bit+0x1ae> .byte 243,66,15,126,44,2 // movq (%rdx,%r8,1),%xmm5 .byte 102,15,96,232 // punpcklbw %xmm0,%xmm5 - .byte 102,15,84,45,7,40,0,0 // andpd 0x2807(%rip),%xmm5 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,45,7,40,0,0 // pand 0x2807(%rip),%xmm5 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,69,15,239,192 // pxor %xmm8,%xmm8 - .byte 102,15,40,229 // movapd %xmm5,%xmm4 + .byte 102,15,111,229 // movdqa %xmm5,%xmm4 .byte 102,65,15,105,224 // punpckhwd %xmm8,%xmm4 .byte 102,65,15,97,232 // punpcklwd %xmm8,%xmm5 .byte 102,15,114,245,24 // pslld $0x18,%xmm5 diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 99ec6b9fa9..d85a0de655 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -39685,9 +39685,9 @@ _sk_load_a8_sse2_8bit LABEL PROC DB 117,48 ; jne 296ad <_sk_load_a8_sse2_8bit+0x4d> DB 243,66,15,126,4,2 ; movq 
(%rdx,%r8,1),%xmm0 DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0 - DB 102,15,84,5,193,51,0,0 ; andpd 0x33c1(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,5,193,51,0,0 ; pand 0x33c1(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,239,228 ; pxor %xmm4,%xmm4 - DB 102,15,40,200 ; movapd %xmm0,%xmm1 + DB 102,15,111,200 ; movdqa %xmm0,%xmm1 DB 102,15,105,204 ; punpckhwd %xmm4,%xmm1 DB 102,15,97,196 ; punpcklwd %xmm4,%xmm0 DB 102,15,114,240,24 ; pslld $0x18,%xmm0 @@ -39760,9 +39760,9 @@ _sk_load_a8_dst_sse2_8bit LABEL PROC DB 117,48 ; jne 297a1 <_sk_load_a8_dst_sse2_8bit+0x4d> DB 243,66,15,126,20,2 ; movq (%rdx,%r8,1),%xmm2 DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2 - DB 102,15,84,21,205,50,0,0 ; andpd 0x32cd(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,21,205,50,0,0 ; pand 0x32cd(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,239,228 ; pxor %xmm4,%xmm4 - DB 102,15,40,218 ; movapd %xmm2,%xmm3 + DB 102,15,111,218 ; movdqa %xmm2,%xmm3 DB 102,15,105,220 ; punpckhwd %xmm4,%xmm3 DB 102,15,97,212 ; punpcklwd %xmm4,%xmm2 DB 102,15,114,242,24 ; pslld $0x18,%xmm2 @@ -39858,48 +39858,46 @@ _sk_store_a8_sse2_8bit LABEL PROC DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax - DB 102,15,127,36,36 ; movdqa %xmm4,(%rsp) - DB 138,4,36 ; mov (%rsp),%al + DB 102,15,127,100,36,64 ; movdqa %xmm4,0x40(%rsp) + DB 138,68,36,64 ; mov 0x40(%rsp),%al DB 66,136,4,2 ; mov %al,(%rdx,%r8,1) - DB 235,201 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d> - DB 102,15,127,100,36,16 ; movdqa %xmm4,0x10(%rsp) - DB 138,68,36,20 ; mov 0x14(%rsp),%al + DB 235,199 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d> + DB 102,15,127,100,36,48 ; movdqa %xmm4,0x30(%rsp) + DB 138,68,36,52 ; mov 0x34(%rsp),%al DB 66,136,68,2,2 ; mov %al,0x2(%rdx,%r8,1) - DB 102,15,219,37,93,49,0,0 ; pand 0x315d(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,37,91,49,0,0 ; pand 0x315b(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,103,228 ; packuswb %xmm4,%xmm4 DB 102,15,126,224 ; movd %xmm4,%eax DB 102,66,137,4,2 ; mov %ax,(%rdx,%r8,1) - DB 235,163 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d> - DB 102,15,127,100,36,64 ; movdqa %xmm4,0x40(%rsp) - DB 138,68,36,76 ; mov 0x4c(%rsp),%al + DB 235,161 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d> + DB 102,15,127,100,36,32 ; movdqa %xmm4,0x20(%rsp) + DB 138,68,36,44 ; mov 0x2c(%rsp),%al DB 66,136,68,2,6 ; mov %al,0x6(%rdx,%r8,1) - DB 102,15,127,100,36,48 ; movdqa %xmm4,0x30(%rsp) - DB 138,68,36,58 ; mov 0x3a(%rsp),%al + DB 102,15,127,100,36,16 ; movdqa %xmm4,0x10(%rsp) + DB 138,68,36,26 ; mov 0x1a(%rsp),%al DB 66,136,68,2,5 ; mov %al,0x5(%rdx,%r8,1) - DB 102,15,127,100,36,32 ; movdqa %xmm4,0x20(%rsp) - DB 138,68,36,40 ; mov 0x28(%rsp),%al + DB 102,15,127,36,36 ; movdqa %xmm4,(%rsp) + DB 138,68,36,8 ; mov 0x8(%rsp),%al DB 66,136,68,2,4 ; mov %al,0x4(%rdx,%r8,1) - DB 102,15,219,37,25,49,0,0 ; pand 0x3119(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,37,24,49,0,0 ; pand 0x3118(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,103,228 ; packuswb %xmm4,%xmm4 DB 102,66,15,126,36,2 ; movd %xmm4,(%rdx,%r8,1) - DB 233,95,255,255,255 ; jmpq 298a5 <_sk_store_a8_sse2_8bit+0x5d> - DB 102,144 ; xchg %ax,%ax + DB 233,94,255,255,255 ; jmpq 298a5 <_sk_store_a8_sse2_8bit+0x5d> + DB 144 ; nop DB 134,255 ; xchg %bh,%bh DB 255 ; (bad) - DB 255,163,255,255,255,148 ; jmpq *-0x6b000001(%rbx) + DB 255,165,255,255,255,150 ; jmpq *-0x69000001(%rbp) DB 255 ; (bad) DB 255 ; (bad) - DB 255,231 ; jmpq *%rdi 
- DB 255 ; (bad) DB 255 ; (bad) + DB 232,255,255,255,218 ; callq ffffffffdb029958 <_sk_overlay_sse2_8bit+0xffffffffdaffdd0b> DB 255 ; (bad) - DB 216,255 ; fdivr %st(7),%st DB 255 ; (bad) - DB 255,201 ; dec %ecx + DB 255,203 ; dec %ebx DB 255 ; (bad) DB 255 ; (bad) DB 255 ; (bad) - DB 186 ; .byte 0xba + DB 188 ; .byte 0xbc DB 255 ; (bad) DB 255 ; (bad) DB 255 ; .byte 0xff @@ -39917,9 +39915,9 @@ _sk_load_g8_sse2_8bit LABEL PROC DB 117,116 ; jne 299f5 <_sk_load_g8_sse2_8bit+0x91> DB 243,66,15,126,4,2 ; movq (%rdx,%r8,1),%xmm0 DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0 - DB 102,15,84,5,189,48,0,0 ; andpd 0x30bd(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,5,189,48,0,0 ; pand 0x30bd(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,239,201 ; pxor %xmm1,%xmm1 - DB 102,15,40,224 ; movapd %xmm0,%xmm4 + DB 102,15,111,224 ; movdqa %xmm0,%xmm4 DB 102,15,97,225 ; punpcklwd %xmm1,%xmm4 DB 102,15,105,193 ; punpckhwd %xmm1,%xmm0 DB 102,15,111,45,245,55,0,0 ; movdqa 0x37f5(%rip),%xmm5 # 2d1a0 <_sk_overlay_sse2_8bit+0x1553> @@ -40007,9 +40005,9 @@ _sk_load_g8_dst_sse2_8bit LABEL PROC DB 117,116 ; jne 29b35 <_sk_load_g8_dst_sse2_8bit+0x91> DB 243,66,15,126,20,2 ; movq (%rdx,%r8,1),%xmm2 DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2 - DB 102,15,84,21,125,47,0,0 ; andpd 0x2f7d(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,21,125,47,0,0 ; pand 0x2f7d(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,239,219 ; pxor %xmm3,%xmm3 - DB 102,15,40,226 ; movapd %xmm2,%xmm4 + DB 102,15,111,226 ; movdqa %xmm2,%xmm4 DB 102,15,97,227 ; punpcklwd %xmm3,%xmm4 DB 102,15,105,211 ; punpckhwd %xmm3,%xmm2 DB 102,15,111,45,181,54,0,0 ; movdqa 0x36b5(%rip),%xmm5 # 2d1a0 <_sk_overlay_sse2_8bit+0x1553> @@ -40284,9 +40282,9 @@ _sk_scale_u8_sse2_8bit LABEL PROC DB 15,133,239,0,0,0 ; jne 29fe1 <_sk_scale_u8_sse2_8bit+0x110> DB 243,66,15,126,36,2 ; movq (%rdx,%r8,1),%xmm4 DB 102,15,96,224 ; punpcklbw %xmm0,%xmm4 - DB 102,15,84,37,76,43,0,0 ; andpd 0x2b4c(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,37,76,43,0,0 ; pand 0x2b4c(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,69,15,239,192 ; pxor %xmm8,%xmm8 - DB 102,15,40,236 ; movapd %xmm4,%xmm5 + DB 102,15,111,236 ; movdqa %xmm4,%xmm5 DB 102,65,15,105,232 ; punpckhwd %xmm8,%xmm5 DB 102,65,15,97,224 ; punpcklwd %xmm8,%xmm4 DB 102,15,114,244,24 ; pslld $0x18,%xmm4 @@ -40470,9 +40468,9 @@ _sk_lerp_u8_sse2_8bit LABEL PROC DB 15,133,141,1,0,0 ; jne 2a378 <_sk_lerp_u8_sse2_8bit+0x1ae> DB 243,66,15,126,44,2 ; movq (%rdx,%r8,1),%xmm5 DB 102,15,96,232 ; punpcklbw %xmm0,%xmm5 - DB 102,15,84,45,83,40,0,0 ; andpd 0x2853(%rip),%xmm5 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,45,83,40,0,0 ; pand 0x2853(%rip),%xmm5 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,69,15,239,192 ; pxor %xmm8,%xmm8 - DB 102,15,40,229 ; movapd %xmm5,%xmm4 + DB 102,15,111,229 ; movdqa %xmm5,%xmm4 DB 102,65,15,105,224 ; punpckhwd %xmm8,%xmm4 DB 102,65,15,97,232 ; punpcklwd %xmm8,%xmm5 DB 102,15,114,245,24 ; pslld $0x18,%xmm5 diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp index 5c73ea8cbe..0c019f8fbc 100644 --- a/src/jumper/SkJumper_stages_8bit.cpp +++ b/src/jumper/SkJumper_stages_8bit.cpp @@ -5,23 +5,27 @@ * found in the LICENSE file. */ +// This restricted SkJumper backend works on 8-bit per channel interlaced +// pixels. This is the natural format for kN32_SkColorType buffers, and we +// hope the stages in this file can replace many custom legacy routines. 
+ #include "SkJumper.h" #include "SkJumper_misc.h" -#if defined(__SSE2__) +// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code. +// Any other platform (so far) is offline-only. +#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__)) + +#if defined(__aarch64__) + #include <arm_neon.h> +#else #include <immintrin.h> #endif -// This restricted SkJumper backend works on 8-bit per channel interlaced -// pixels. This is the natural format for kN32_SkColorType buffers, and we -// hope the stages in this file can replace many custom legacy routines. - #if !defined(JUMPER_IS_OFFLINE) - #error "This file must be pre-compiled." + #define WRAP(name) sk_##name##_8bit #elif defined(__aarch64__) #define WRAP(name) sk_##name##_aarch64_8bit -#elif defined(__arm__) - #define WRAP(name) sk_##name##_vfp4_8bit #elif defined(__AVX2__) #define WRAP(name) sk_##name##_hsw_8bit #elif defined(__SSE4_1__) @@ -112,7 +116,7 @@ SI V operator*(V x, V y) { template <typename T> SI T inv(T v) { return 0xff - v; } -SI V two(V v) { return v + v; } + SI V lerp(V from, V to, V t) { return to*t + from*inv(t); } SI V alpha(V v) { @@ -162,10 +166,13 @@ SI V saturated_add(V a, V b) { b_lo, b_hi; split(a.u8x4, &a_lo, &a_hi); split(b.u8x4, &b_lo, &b_hi); -#if defined(__AVX2__) +#if defined(__aarch64__) + return join(vqaddq_u8(a_lo, b_lo), + vqaddq_u8(a_hi, b_hi)); +#elif defined(__AVX2__) return join(_mm256_adds_epu8(a_lo, b_lo), _mm256_adds_epu8(a_hi, b_hi)); -#else +#elif defined(__SSE2__) return join(_mm_adds_epu8(a_lo, b_lo), _mm_adds_epu8(a_hi, b_hi)); #endif @@ -185,7 +192,11 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d MAYBE_MSABI extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit, void** program, const SkJumper_constants*) { - R r; +#if defined(JUMPER_IS_OFFLINE) + R r; // Fastest to start uninitialized. +#else + R r{}; // Next best is zero'd for compilers that will complain about uninitialized values. +#endif auto start = (Stage*)load_and_inc(program); for (; y < ylimit; y++) { Params params = { x,y,0 }; @@ -223,6 +234,7 @@ SI V load(const T* src, size_t tail) { if (__builtin_expect(tail, 0)) { V v = 0; switch (tail) { + #if defined(__AVX2__) case 15: v[14] = src[14]; case 14: v[13] = src[13]; case 13: v[12] = src[12]; @@ -231,6 +243,7 @@ SI V load(const T* src, size_t tail) { case 10: v[ 9] = src[ 9]; case 9: v[ 8] = src[ 8]; case 8: memcpy(&v, src, 8*sizeof(T)); break; + #endif case 7: v[6] = src[6]; case 6: v[5] = src[5]; case 5: v[4] = src[4]; @@ -249,6 +262,7 @@ SI void store(T* dst, V v, size_t tail) { __builtin_assume(tail < kStride); if (__builtin_expect(tail, 0)) { switch (tail) { + #if defined(__AVX2__) case 15: dst[14] = v[14]; case 14: dst[13] = v[13]; case 13: dst[12] = v[12]; @@ -257,6 +271,7 @@ SI void store(T* dst, V v, size_t tail) { case 10: dst[ 9] = v[ 9]; case 9: dst[ 8] = v[ 8]; case 8: memcpy(dst, &v, 8*sizeof(T)); break; + #endif case 7: dst[6] = v[6]; case 6: dst[5] = v[5]; case 5: dst[4] = v[4]; @@ -461,3 +476,5 @@ STAGE(overlay) { // colorburn | // colordodge > these involve division, which makes them (much) slower than the float stages. // softlight | + +#endif |