author     mtklein <mtklein@chromium.org>          2015-01-27 14:35:18 -0800
committer  Commit bot <commit-bot@chromium.org>    2015-01-27 14:35:18 -0800
commit     6dbfb21a6c88af6d94e8c823c3ad559f1a41b493 (patch)
tree       3fab134efd5334870b80ee7164329117c22afaf7
parent     8ece6eb37b4f3c98587efa071ce565d26b156e9a (diff)
SSE4 opaque blend using intrinsics instead of assembly.
Since we had such a hard time with the assembly versions of this blit (to the
point that we have them completely disabled everywhere), I thought I'd take a
shot at writing a version of the blit using intrinsics.

The key feature of SSE4 we're exploiting is that we can use ptest (_mm_test*)
to skip the blend when the 16 src pixels we consider each loop are all opaque
or all transparent. _mm_shuffle_epi8 from SSSE3 also lends a hand to extract
all those alphas. It's worth looking to see if we can backport this type of
logic to SSE2 using _mm_movemask_epi8, or up to 32 pixels at a time using AVX.

My local performance testing doesn't show this to be an unambiguous win (there
are probably microbenchmarks and SKPs where we'd be better off just powering
through the blend rather than looking at alphas), but the potential does seem
tantalizing enough to let skiaperf vet it on the bots. (< 1.0x is a win.)

DM says it draws pixel-perfect compared to the old code.

Microbenchmarks:
  bitmap_RGBA_8888_A_source_stripes_two                  14us   -> 14.4us  1.03x
  bitmap_RGBA_8888_A_source_stripes_three                14.3us -> 14.5us  1.01x
  bitmap_RGBA_8888_scale_bilerp                          61.9us -> 62.2us  1.01x
  bitmap_RGBA_8888_update_volatile_scale_rotate_bilerp   102us  -> 101us   0.99x
  bitmap_RGBA_8888_scale_rotate_bilerp                   103us  -> 101us   0.99x
  bitmap_RGBA_8888_scale                                 18.4us -> 18.2us  0.99x
  bitmap_RGBA_8888_A_scale_rotate_bicubic                71us   -> 70us    0.99x
  bitmap_RGBA_8888_update_scale_rotate_bilerp            103us  -> 101us   0.99x
  bitmap_RGBA_8888_A_scale_rotate_bilerp                 112us  -> 109us   0.98x
  bitmap_RGBA_8888_update_volatile                       5.72us -> 5.58us  0.98x
  bitmap_RGBA_8888                                       5.73us -> 5.58us  0.97x
  bitmap_RGBA_8888_update                                5.78us -> 5.6us   0.97x
  bitmap_RGBA_8888_A_scale_bilerp                        70.7us -> 68us    0.96x
  bitmap_RGBA_8888_A_scale_bicubic                       23.7us -> 21.8us  0.92x
  bitmap_RGBA_8888_A                                     13.9us -> 10.9us  0.78x
  bitmap_RGBA_8888_A_source_opaque                       14us   -> 6.29us  0.45x
  bitmap_RGBA_8888_A_source_transparent                  14us   -> 3.65us  0.26x

Running over our ~70 SKP web page captures, this looks like we spend 0.7x the
time in S32A_Opaque_BlitRow compared to the SSE2 version, which should be a
decent predictor of real-world impact.

BUG=chromium:399842
Committed: https://skia.googlesource.com/skia/+/04bc91b972417038fecfa87c484771eac2b9b785
CQ_EXTRA_TRYBOTS=client.skia:Test-Mac10.6-MacMini4.1-GeForce320M-x86_64-Release-Trybot

Review URL: https://codereview.chromium.org/874863002
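As a sketch of the SSE2 backport floated above: without ptest, the same two
checks can be built from _mm_movemask_epi8 over byte compares. The helpers
below are hypothetical (not part of this CL) and assume SK_A32_SHIFT == 24,
i.e. alpha in each pixel's top byte:

    #include <emmintrin.h>  // SSE2

    // Hypothetical SSE2 equivalents of the two ptest checks; not part of
    // this CL. ORed/ANDed are the OR/AND of the 16 source pixels, exactly
    // as computed in the SSE4 loop.
    static inline bool all_transparent_SSE2(__m128i ORed) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF << 24);
        // Every byte of (ORed & alphaMask) is zero iff every alpha is zero.
        __m128i eq = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
                                    _mm_setzero_si128());
        return _mm_movemask_epi8(eq) == 0xFFFF;
    }

    static inline bool all_opaque_SSE2(__m128i ANDed) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF << 24);
        // Every alpha byte of ANDed is 0xFF iff all 16 pixels are opaque.
        __m128i eq = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
        return _mm_movemask_epi8(eq) == 0xFFFF;
    }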
-rw-r--r--   gyp/opts.gypi                             1
-rw-r--r--   src/opts/SkBlitRow_opts_SSE4.cpp         66
-rw-r--r--   src/opts/SkBlitRow_opts_SSE4.h           23
-rw-r--r--   src/opts/SkBlitRow_opts_SSE4_asm.S      475
-rw-r--r--   src/opts/SkBlitRow_opts_SSE4_x64_asm.S  472
-rw-r--r--   src/opts/SkColor_opts_SSE2.h              9
-rw-r--r--   src/opts/opts_check_x86.cpp               6
7 files changed, 80 insertions(+), 972 deletions(-)
diff --git a/gyp/opts.gypi b/gyp/opts.gypi
index f6257a97b4..dfcf434aff 100644
--- a/gyp/opts.gypi
+++ b/gyp/opts.gypi
@@ -79,5 +79,6 @@
],
'sse41_sources': [
'<(skia_src_path)/opts/SkBlurImage_opts_SSE4.cpp',
+ '<(skia_src_path)/opts/SkBlitRow_opts_SSE4.cpp',
],
}
diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp
new file mode 100644
index 0000000000..fd837d54fe
--- /dev/null
+++ b/src/opts/SkBlitRow_opts_SSE4.cpp
@@ -0,0 +1,66 @@
+#include "SkBlitRow_opts_SSE4.h"
+
+// Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.
+// The stubs should never be called, so we make them crash just to confirm that.
+#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
+void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
+ sk_throw();
+}
+
+#else
+
+#include <emmintrin.h> // SSE2: Most _mm_foo() in this file.
+#include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128.
+
+#include "SkColorPriv.h"
+#include "SkColor_opts_SSE2.h"
+
+void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count,
+ U8CPU alpha) {
+ SkASSERT(alpha == 255);
+ // As long as we can, we'll work on 16 pixels at a time.
+ int count16 = count / 16;
+ __m128i* dst4 = (__m128i*)dst;
+ const __m128i* src4 = (const __m128i*)src;
+
+ for (int i = 0; i < count16 * 4; i += 4) {
+ // Load 16 source pixels.
+ __m128i s0 = _mm_loadu_si128(src4+i+0),
+ s1 = _mm_loadu_si128(src4+i+1),
+ s2 = _mm_loadu_si128(src4+i+2),
+ s3 = _mm_loadu_si128(src4+i+3);
+
+ const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
+ const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
+ if (_mm_testz_si128(ORed, alphaMask)) {
+ // All 16 source pixels are fully transparent. There's nothing to do!
+ continue;
+ }
+ const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
+ if (_mm_testc_si128(ANDed, alphaMask)) {
+ // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
+ _mm_storeu_si128(dst4+i+0, s0);
+ _mm_storeu_si128(dst4+i+1, s1);
+ _mm_storeu_si128(dst4+i+2, s2);
+ _mm_storeu_si128(dst4+i+3, s3);
+ continue;
+ }
+ // The general slow case: do the blend for all 16 pixels.
+ _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
+ _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
+ _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
+ _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
+ }
+
+ // Wrap up the last <= 15 pixels.
+ for (int i = count16*16; i < count; i++) {
+ // This check is not really necessary, but it prevents pointless autovectorization.
+ if (src[i] & 0xFF000000) {
+ dst[i] = SkPMSrcOver(src[i], dst[i]);
+ }
+ }
+}
+
+#endif
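For reference, _mm_testz_si128(v, m) returns 1 iff (v & m) == 0, and
_mm_testc_si128(v, m) returns 1 iff (~v & m) == 0, i.e. v contains every set
bit of m. A scalar model of the classification the new loop performs (the
classify() helper is illustrative only, not from the CL):

    #include <cstdint>

    enum AlphaClass { kAllTransparent, kAllOpaque, kMixed };

    // Scalar model of the OR/AND + ptest trick above; illustrative only.
    static AlphaClass classify(const uint32_t px[16]) {
        uint32_t ORed = 0, ANDed = 0xFFFFFFFF;
        for (int i = 0; i < 16; i++) {
            ORed  |= px[i];
            ANDed &= px[i];
        }
        const uint32_t alphaMask = 0xFF000000;   // assumes SK_A32_SHIFT == 24
        if ((ORed & alphaMask) == 0)   return kAllTransparent;  // _mm_testz_si128
        if ((~ANDed & alphaMask) == 0) return kAllOpaque;       // _mm_testc_si128
        return kMixed;
    }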
diff --git a/src/opts/SkBlitRow_opts_SSE4.h b/src/opts/SkBlitRow_opts_SSE4.h
index 600e669893..577ace6f8f 100644
--- a/src/opts/SkBlitRow_opts_SSE4.h
+++ b/src/opts/SkBlitRow_opts_SSE4.h
@@ -10,24 +10,9 @@
#include "SkBlitRow.h"
-#ifdef CRBUG_399842_FIXED
-
-/* Check if we are able to build assembly code, GCC/AT&T syntax:
- * 1) Clang and GCC are generally OK. OS X's old LLVM-GCC 4.2 can't handle it;
- * 2) We're intentionally not linking this in even when supported (Clang) on Windows;
- * 3) MemorySanitizer cannot instrument assembly at all.
- */
-#if /* 1)*/ (defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))) \
- /* 2)*/ && !defined(SK_BUILD_FOR_WIN) \
- /* 3)*/ && !defined(MEMORY_SANITIZER)
-extern "C" void S32A_Opaque_BlitRow32_SSE4_asm(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha);
-
-#define SK_ATT_ASM_SUPPORTED
-#endif
-
-#endif // CRBUG_399842_FIXED
-
+void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
+ const SkPMColor* SK_RESTRICT,
+ int count,
+ U8CPU alpha);
#endif
diff --git a/src/opts/SkBlitRow_opts_SSE4_asm.S b/src/opts/SkBlitRow_opts_SSE4_asm.S
deleted file mode 100644
index 0f5281713d..0000000000
--- a/src/opts/SkBlitRow_opts_SSE4_asm.S
+++ /dev/null
@@ -1,475 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifdef CRBUG_399842_FIXED
-
-#if defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))
-
-#define CFI_PUSH(REG) \
- .cfi_adjust_cfa_offset 4; \
- .cfi_rel_offset REG, 0
-
-#define CFI_POP(REG) \
- .cfi_adjust_cfa_offset -4; \
- .cfi_restore REG
-
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
-#define RETURN POP(%edi); ret
-
-#define EXTRACT_ALPHA(var1, var2) \
- movdqa %var1, %var2; /* Clone source pixels to extract alpha */\
- psrlw $8, %var2; /* Discard red and blue, leaving alpha and green */\
- pshufhw $0xF5, %var2, %var2; /* Repeat alpha for scaling (high) */\
- movdqa %xmm6, %xmm4; \
- pshuflw $0xF5, %var2, %var2; /* Repeat alpha for scaling (low) */\
- movdqa %xmm5, %xmm3; \
- psubw %var2, %xmm4 /* Finalize alpha calculations */
-
-#define SCALE_PIXELS \
- psllw $8, %xmm5; /* Filter out red and blue components */\
- pmulhuw %xmm4, %xmm5; /* Scale red and blue */\
- psrlw $8, %xmm3; /* Filter out alpha and green components */\
- pmullw %xmm4, %xmm3 /* Scale alpha and green */
-
-
-/*
- * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
- * const SkPMColor* SK_RESTRICT src,
- * int count, U8CPU alpha)
- *
- * This function is divided into six blocks: initialization, blit 4-15 pixels,
- * blit 0-3 pixels, align destination for 16+ pixel blits,
- * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
- * There are some code reuse between the blocks.
- *
- * The primary optimization comes from checking the source pixels' alpha value.
- * If the alpha is zero, the pixel can be skipped entirely.
- * If the alpha is fully opaque, the pixel can be copied directly to the destination.
- * According to collected statistics, these two cases are the most common.
- * The main loop(s) uses pre-loading and unrolling in an attempt to reduce the
- * memory latency worse-case.
- */
-
-#ifdef __clang__
- .text
-#else
- .section .text.sse4.2,"ax",@progbits
- .type S32A_Opaque_BlitRow32_SSE4_asm, @function
-#endif
- .p2align 4
-#if defined(SK_BUILD_FOR_MAC)
- .global _S32A_Opaque_BlitRow32_SSE4_asm
- .private_extern _S32A_Opaque_BlitRow32_SSE4_asm
-_S32A_Opaque_BlitRow32_SSE4_asm:
-#else
- .global S32A_Opaque_BlitRow32_SSE4_asm
- .hidden S32A_Opaque_BlitRow32_SSE4_asm
-S32A_Opaque_BlitRow32_SSE4_asm:
-#endif
- .cfi_startproc
- movl 8(%esp), %eax // Source pointer
- movl 12(%esp), %ecx // Pixel count
- movl 4(%esp), %edx // Destination pointer
- prefetcht0 (%eax)
-
- // Setup SSE constants
- pcmpeqd %xmm7, %xmm7 // 0xFF000000 mask to check alpha
- pslld $24, %xmm7
- pcmpeqw %xmm6, %xmm6 // 16-bit 256 to calculate inv. alpha
- psrlw $15, %xmm6
- psllw $8, %xmm6
- pcmpeqw %xmm0, %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)
- psrlw $8, %xmm0
- subl $4, %ecx // Check if we have only 0-3 pixels
- js .LReallySmall
- PUSH(%edi)
- cmpl $11, %ecx // Do we have enough pixels to run the main loop?
- ja .LBigBlit
-
- // Handle small blits (4-15 pixels)
- ////////////////////////////////////////////////////////////////////////////////
- xorl %edi, %edi // Reset offset to zero
-
-.LSmallLoop:
- lddqu (%eax, %edi), %xmm1 // Load four source pixels
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LSmallAlphaNotOpaqueOrZero
- jz .LSmallAlphaZero // If all alphas are zero, skip the pixels completely
- movdqu %xmm1, (%edx, %edi) // Store four destination pixels
-.LSmallAlphaZero:
- addl $16, %edi
- subl $4, %ecx // Check if there are four additional pixels, at least
- jns .LSmallLoop
- jmp .LSmallRemaining
-
- // Handle mixed alphas (calculate and scale)
- .p2align 4
-.LSmallAlphaNotOpaqueOrZero:
- lddqu (%edx, %edi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- addl $16, %edi
- subl $4, %ecx // Check if there are four additional pixels, at least
- pblendvb %xmm5, %xmm3 // Mask in %xmm0, implicitly
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqu %xmm1, -16(%edx, %edi) // Store four destination pixels
- jns .LSmallLoop
-
- // Handle the last 0-3 pixels (also used by the main loops)
-.LSmallRemaining:
- cmpl $-4, %ecx // Check if we are done
- je .LSmallExit
- sall $2, %ecx // Calculate offset for last pixels
- addl %ecx, %edi
-
- lddqu (%eax, %edi), %xmm1 // Load last four source pixels (overlapping)
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- jc .LSmallRemainingStoreAll// If all alphas are opaque, just store (overlapping)
- jz .LSmallExit // If all alphas are zero, skip the pixels completely
-
- // Handle mixed alphas (calculate and scale)
- lddqu (%edx, %edi), %xmm5 // Load last four destination pixels (overlapping)
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
-
- psllw $8, %xmm3 // Filter out red and blue components
- pmulhuw %xmm4, %xmm3 // Scale red and blue
- movdqa %xmm5, %xmm2
- psrlw $8, %xmm2 // Filter out alpha and green components
- pmullw %xmm4, %xmm2 // Scale alpha and green
-
- cmpl $-8, %ecx // Check how many pixels should be written
- pblendvb %xmm3, %xmm2 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm2, %xmm1 // Add source and destination pixels together
- jb .LSmallPixelsLeft1
- ja .LSmallPixelsLeft3 // To avoid double-blending the overlapping pixels...
- pblendw $0xF0, %xmm1, %xmm5 // Merge only the final two pixels to the destination
- movdqu %xmm5, (%edx, %edi) // Store last two destination pixels
-.LSmallExit:
- RETURN
-
-.LSmallPixelsLeft1:
- pblendw $0xC0, %xmm1, %xmm5 // Merge only the final pixel to the destination
- movdqu %xmm5, (%edx, %edi) // Store last destination pixel
- RETURN
-
-.LSmallPixelsLeft3:
- pblendw $0xFC, %xmm1, %xmm5 // Merge only the final three pixels to the destination
- movdqu %xmm5, (%edx, %edi) // Store last three destination pixels
- RETURN
-
-.LSmallRemainingStoreAll:
- movdqu %xmm1, (%edx, %edi) // Store last destination pixels (overwrite)
- RETURN
-
- // Handle really small blits (0-3 pixels)
- ////////////////////////////////////////////////////////////////////////////////
-.LReallySmall:
- addl $4, %ecx
- jle .LReallySmallExit
- pcmpeqd %xmm1, %xmm1
- cmp $2, %ecx // Check how many pixels should be read
- pinsrd $0x0, (%eax), %xmm1 // Load one source pixel
- pinsrd $0x0, (%edx), %xmm5 // Load one destination pixel
- jb .LReallySmallCalc
- pinsrd $0x1, 4(%eax), %xmm1 // Load second source pixel
- pinsrd $0x1, 4(%edx), %xmm5 // Load second destination pixel
- je .LReallySmallCalc
- pinsrd $0x2, 8(%eax), %xmm1 // Load third source pixel
- pinsrd $0x2, 8(%edx), %xmm5 // Load third destination pixel
-
-.LReallySmallCalc:
- ptest %xmm7, %xmm1 // Check if all alphas are opaque
- jc .LReallySmallStore // If all alphas are opaque, just store
-
- // Handle mixed alphas (calculate and scale)
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
-
- pand %xmm0, %xmm5 // Filter out red and blue components
- pmullw %xmm4, %xmm5 // Scale red and blue
- psrlw $8, %xmm3 // Filter out alpha and green components
- pmullw %xmm4, %xmm3 // Scale alpha and green
-
- psrlw $8, %xmm5 // Combine results
- pblendvb %xmm5, %xmm3 // Mask in %xmm0, implicitly
- paddb %xmm3, %xmm1 // Add source and destination pixels together
-
-.LReallySmallStore:
- cmp $2, %ecx // Check how many pixels should be written
- pextrd $0x0, %xmm1, (%edx) // Store one destination pixel
- jb .LReallySmallExit
- pextrd $0x1, %xmm1, 4(%edx) // Store second destination pixel
- je .LReallySmallExit
- pextrd $0x2, %xmm1, 8(%edx) // Store third destination pixel
-.LReallySmallExit:
- ret
-
- // Handle bigger blit operations (16+ pixels)
- ////////////////////////////////////////////////////////////////////////////////
- .p2align 4
-.LBigBlit:
- // Align destination?
- testl $0xF, %edx
- lddqu (%eax), %xmm1 // Pre-load four source pixels
- jz .LAligned
-
- movl %edx, %edi // Calculate alignment of destination pointer
- negl %edi
- andl $0xF, %edi
-
- // Handle 1-3 pixels to align destination
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- jz .LAlignDone // If all alphas are zero, just skip
- lddqu (%edx), %xmm5 // Load four destination pixels
- jc .LAlignStore // If all alphas are opaque, just store
-
- // Handle mixed alphas (calculate and scale)
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
-
- psllw $8, %xmm3 // Filter out red and blue components
- pmulhuw %xmm4, %xmm3 // Scale red and blue
- movdqa %xmm5, %xmm2
- psrlw $8, %xmm2 // Filter out alpha and green components
- pmullw %xmm4, %xmm2 // Scale alpha and green
-
- pblendvb %xmm3, %xmm2 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm2, %xmm1 // Add source and destination pixels together
-
-.LAlignStore:
- cmp $8, %edi // Check how many pixels should be written
- jb .LAlignPixelsLeft1
- ja .LAlignPixelsLeft3
- pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels
- jmp .LAlignStorePixels
-
-.LAlignPixelsLeft1:
- pblendw $0x03, %xmm1, %xmm5 // Blend one pixel
- jmp .LAlignStorePixels
-
-.LAlignPixelsLeft3:
- pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels
-
-.LAlignStorePixels:
- movdqu %xmm5, (%edx) // Store destination pixels
-
-.LAlignDone:
- addl %edi, %eax // Adjust pointers and pixel count
- addl %edi, %edx
- shrl $2, %edi
- lddqu (%eax), %xmm1 // Pre-load new source pixels (after alignment)
- subl %edi, %ecx
-
-.LAligned: // Destination is guaranteed to be 16 byte aligned
- xorl %edi, %edi // Reset offset to zero
- subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup)
- testl $0xF, %eax // Check alignment of source pointer
- jz .LAlignedLoop
-
- // Source not aligned to destination
- ////////////////////////////////////////////////////////////////////////////////
- .p2align 4
-.LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero00
- lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels
- jz .LAlphaZero00
- movdqa %xmm1, (%edx, %edi) // Store four destination pixels
-
-.LAlphaZero00:
- ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero01
- lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels
- jz .LAlphaZero01
- movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels
-
-.LAlphaZero01:
- addl $32, %edi // Adjust offset and pixel count
- subl $8, %ecx
- jae .LUnalignedLoop
- addl $8, %ecx // Adjust pixel count
- jmp .LLoopCleanup0
-
- .p2align 4
-.LAlphaNotOpaqueOrZero00:
- movdqa (%edx, %edi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqa %xmm1, (%edx, %edi) // Store four destination pixels
-
- // Handle next four pixels
- ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero01
- lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels
- jz .LAlphaZero02
- movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels
-.LAlphaZero02:
- addl $32, %edi // Adjust offset and pixel count
- subl $8, %ecx
- jae .LUnalignedLoop
- addl $8, %ecx // Adjust pixel count
- jmp .LLoopCleanup0
-
- .p2align 4
-.LAlphaNotOpaqueOrZero01:
- movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm2, xmm1) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels
- addl $32, %edi
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm2 // Add source and destination pixels together
- subl $8, %ecx
- movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels
- jae .LUnalignedLoop
- addl $8, %ecx // Adjust pixel count
-
- // Cleanup - handle pending pixels from loop
-.LLoopCleanup0:
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero02
- jz .LAlphaZero03
- movdqa %xmm1, (%edx, %edi) // Store four destination pixels
-.LAlphaZero03:
- addl $16, %edi
- subl $4, %ecx
- js .LSmallRemaining // Reuse code from small loop
-
-.LRemain0:
- lddqu (%eax, %edi), %xmm1 // Load four source pixels
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero02
- jz .LAlphaZero04
- movdqa %xmm1, (%edx, %edi) // Store four destination pixels
-.LAlphaZero04:
- addl $16, %edi
- subl $4, %ecx
- jmp .LSmallRemaining // Reuse code from small loop
-
-.LAlphaNotOpaqueOrZero02:
- movdqa (%edx, %edi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- addl $16, %edi
- subl $4, %ecx
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqa %xmm1, -16(%edx, %edi) // Store four destination pixels
- js .LSmallRemaining // Reuse code from small loop
- jmp .LRemain0
-
- // Source aligned to destination
- ////////////////////////////////////////////////////////////////////////////////
- .p2align 4
-.LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero10
- movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels
- jz .LAlphaZero10
- movdqa %xmm1, (%edx, %edi) // Store four destination pixels
-
-.LAlphaZero10:
- ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero11
- movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels
- jz .LAlphaZero11
- movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels
-
-.LAlphaZero11:
- addl $32, %edi // Adjust offset and pixel count
- subl $8, %ecx
- jae .LAlignedLoop
- addl $8, %ecx // Adjust pixel count
- jmp .LLoopCleanup1
-
- .p2align 4
-.LAlphaNotOpaqueOrZero10:
- movdqa (%edx, %edi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqa %xmm1, (%edx, %edi) // Store four destination pixels
-
- // Handle next four pixels
- ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero11
- movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels
- jz .LAlphaZero12
- movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels
-.LAlphaZero12:
- addl $32, %edi // Adjust offset and pixel count
- subl $8, %ecx
- jae .LAlignedLoop
- addl $8, %ecx // Adjust pixel count
- jmp .LLoopCleanup1
-
- .p2align 4
-.LAlphaNotOpaqueOrZero11:
- movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm2, xmm1) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
- movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels
-
- addl $32, %edi
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm2 // Add source and destination pixels together
- subl $8, %ecx
- movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels
- jae .LAlignedLoop
- addl $8, %ecx // Adjust pixel count
-
- // Cleanup - handle pending pixels from loop
-.LLoopCleanup1:
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero12
- jz .LAlphaZero13
- movdqa %xmm1, (%edx, %edi) // Store four destination pixels
-.LAlphaZero13:
- addl $16, %edi
- subl $4, %ecx
- js .LSmallRemaining // Reuse code from small loop
-
-.LRemain1:
- movdqa (%eax, %edi), %xmm1 // Load four source pixels
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero12
- jz .LAlphaZero14
- movdqa %xmm1, (%edx, %edi) // Store four destination pixels
-.LAlphaZero14:
- addl $16, %edi
- subl $4, %ecx
- jmp .LSmallRemaining // Reuse code from small loop
-
-.LAlphaNotOpaqueOrZero12:
- movdqa (%edx, %edi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- addl $16, %edi
- subl $4, %ecx
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqa %xmm1, -16(%edx, %edi) // Store four destination pixels
- js .LSmallRemaining // Reuse code from small loop
- jmp .LRemain1
-
- .cfi_endproc
-#ifndef __clang__
- .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
-#endif
-#endif
-
-#endif // CRBUG_399842_FIXED
diff --git a/src/opts/SkBlitRow_opts_SSE4_x64_asm.S b/src/opts/SkBlitRow_opts_SSE4_x64_asm.S
deleted file mode 100644
index 9a754a635b..0000000000
--- a/src/opts/SkBlitRow_opts_SSE4_x64_asm.S
+++ /dev/null
@@ -1,472 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifdef CRBUG_399842_FIXED
-
-#if defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))
-
-#define EXTRACT_ALPHA(var1, var2) \
- movdqa %var1, %var2; /* Clone source pixels to extract alpha */\
- psrlw $8, %var2; /* Discard red and blue, leaving alpha and green */\
- pshufhw $0xF5, %var2, %var2; /* Repeat alpha for scaling (high) */\
- movdqa %xmm6, %xmm4; \
- pshuflw $0xF5, %var2, %var2; /* Repeat alpha for scaling (low) */\
- movdqa %xmm5, %xmm3; \
- psubw %var2, %xmm4 /* Finalize alpha calculations */
-
-#define SCALE_PIXELS \
- psllw $8, %xmm5; /* Filter out red and blue components */\
- pmulhuw %xmm4, %xmm5; /* Scale red and blue */\
- psrlw $8, %xmm3; /* Filter out alpha and green components */\
- pmullw %xmm4, %xmm3 /* Scale alpha and green */
-
-
-/*
- * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
- * const SkPMColor* SK_RESTRICT src,
- * int count, U8CPU alpha)
- *
- * This function is divided into six blocks: initialization, blit 4-15 pixels,
- * blit 0-3 pixels, align destination for 16+ pixel blits,
- * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
- * There are some code reuse between the blocks.
- *
- * The primary optimization comes from checking the source pixels' alpha value.
- * If the alpha is zero, the pixel can be skipped entirely.
- * If the alpha is fully opaque, the pixel can be copied directly to the destination.
- * According to collected statistics, these two cases are the most common.
- * The main loop(s) uses pre-loading and unrolling in an attempt to reduce the
- * memory latency worse-case.
- */
-
-#ifdef __clang__
- .text
-#else
- .section .text.sse4.2,"ax",@progbits
- .type S32A_Opaque_BlitRow32_SSE4_asm, @function
-#endif
- .p2align 4
-#if defined(SK_BUILD_FOR_MAC)
- .global _S32A_Opaque_BlitRow32_SSE4_asm
- .private_extern _S32A_Opaque_BlitRow32_SSE4_asm
-_S32A_Opaque_BlitRow32_SSE4_asm:
-#else
- .global S32A_Opaque_BlitRow32_SSE4_asm
- .hidden S32A_Opaque_BlitRow32_SSE4_asm
-S32A_Opaque_BlitRow32_SSE4_asm:
-#endif
- .cfi_startproc
- prefetcht0 (%rsi)
- movl %edx, %ecx // Pixel count
- movq %rdi, %rdx // Destination pointer
- movq %rsi, %rax // Source pointer
-
- // Setup SSE constants
- movdqa .LAlphaCheckMask(%rip), %xmm7 // 0xFF000000 mask to check alpha
- movdqa .LInverseAlphaCalc(%rip), %xmm6// 16-bit 256 to calculate inv. alpha
- movdqa .LResultMergeMask(%rip), %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)
-
- subl $4, %ecx // Check if we have only 0-3 pixels
- js .LReallySmall
- cmpl $11, %ecx // Do we have enough pixels to run the main loop?
- ja .LBigBlit
-
- // Handle small blits (4-15 pixels)
- ////////////////////////////////////////////////////////////////////////////////
- xorq %rdi, %rdi // Reset offset to zero
-
-.LSmallLoop:
- lddqu (%rax, %rdi), %xmm1 // Load four source pixels
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LSmallAlphaNotOpaqueOrZero
- jz .LSmallAlphaZero
- movdqu %xmm1, (%rdx, %rdi) // Store four destination pixels
-.LSmallAlphaZero:
- addq $16, %rdi
- subl $4, %ecx // Check if there are four additional pixels, at least
- jns .LSmallLoop
- jmp .LSmallRemaining
-
- // Handle mixed alphas (calculate and scale)
- .p2align 4
-.LSmallAlphaNotOpaqueOrZero:
- lddqu (%rdx, %rdi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- addq $16, %rdi
- subl $4, %ecx // Check if there are four additional pixels, at least
- pblendvb %xmm5, %xmm3 // Mask in %xmm0, implicitly
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqu %xmm1, -16(%rdx, %rdi) // Store four destination pixels
- jns .LSmallLoop
-
- // Handle the last 0-3 pixels (also used by the main loops)
-.LSmallRemaining:
- cmpl $-4, %ecx // Check if we are done
- je .LSmallExit
- sall $2, %ecx // Calculate offset for last pixels
- movslq %ecx, %rcx
- addq %rcx, %rdi
-
- lddqu (%rax, %rdi), %xmm1 // Load last four source pixels (overlapping)
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- jc .LSmallRemainingStoreAll// If all alphas are opaque, just store (overlapping)
- jz .LSmallExit // If all alphas are zero, skip the pixels completely
-
- // Handle mixed alphas (calculate and scale)
- lddqu (%rdx, %rdi), %xmm5 // Load last four destination pixels (overlapping)
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
-
- psllw $8, %xmm3 // Filter out red and blue components
- pmulhuw %xmm4, %xmm3 // Scale red and blue
- movdqa %xmm5, %xmm2
- psrlw $8, %xmm2 // Filter out alpha and green components
- pmullw %xmm4, %xmm2 // Scale alpha and green
-
- cmpl $-8, %ecx // Check how many pixels should be written
- pblendvb %xmm3, %xmm2 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm2, %xmm1 // Add source and destination pixels together
- jb .LSmallPixelsLeft1
- ja .LSmallPixelsLeft3 // To avoid double-blending the overlapping pixels...
- pblendw $0xF0, %xmm1, %xmm5 // Merge only the final two pixels to the destination
- movdqu %xmm5, (%rdx, %rdi) // Store last two destination pixels
-.LSmallExit:
- ret
-
-.LSmallPixelsLeft1:
- pblendw $0xC0, %xmm1, %xmm5 // Merge only the final pixel to the destination
- movdqu %xmm5, (%rdx, %rdi) // Store last destination pixel
- ret
-
-.LSmallPixelsLeft3:
- pblendw $0xFC, %xmm1, %xmm5 // Merge only the final three pixels to the destination
- movdqu %xmm5, (%rdx, %rdi) // Store last three destination pixels
- ret
-
-.LSmallRemainingStoreAll:
- movdqu %xmm1, (%rdx, %rdi) // Store last destination pixels (overwrite)
- ret
-
- // Handle really small blits (0-3 pixels)
- ////////////////////////////////////////////////////////////////////////////////
-.LReallySmall:
- addl $4, %ecx
- jle .LReallySmallExit
- pcmpeqd %xmm1, %xmm1
- cmpl $2, %ecx // Check how many pixels should be read
- pinsrd $0x0, (%rax), %xmm1 // Load one source pixel
- pinsrd $0x0, (%rdx), %xmm5 // Load one destination pixel
- jb .LReallySmallCalc
- pinsrd $0x1, 4(%rax), %xmm1 // Load second source pixel
- pinsrd $0x1, 4(%rdx), %xmm5 // Load second destination pixel
- je .LReallySmallCalc
- pinsrd $0x2, 8(%rax), %xmm1 // Load third source pixel
- pinsrd $0x2, 8(%rdx), %xmm5 // Load third destination pixel
-
-.LReallySmallCalc:
- ptest %xmm7, %xmm1 // Check if all alphas are opaque
- jc .LReallySmallStore // If all alphas are opaque, just store
-
- // Handle mixed alphas (calculate and scale)
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
-
- pand %xmm0, %xmm5 // Filter out red and blue components
- pmullw %xmm4, %xmm5 // Scale red and blue
- psrlw $8, %xmm3 // Filter out alpha and green components
- pmullw %xmm4, %xmm3 // Scale alpha and green
-
- psrlw $8, %xmm5 // Combine results
- pblendvb %xmm5, %xmm3 // Mask in %xmm0, implicitly
- paddb %xmm3, %xmm1 // Add source and destination pixels together
-
-.LReallySmallStore:
- cmpl $2, %ecx // Check how many pixels should be written
- pextrd $0x0, %xmm1, (%rdx) // Store one destination pixel
- jb .LReallySmallExit
- pextrd $0x1, %xmm1, 4(%rdx) // Store second destination pixel
- je .LReallySmallExit
- pextrd $0x2, %xmm1, 8(%rdx) // Store third destination pixel
-.LReallySmallExit:
- ret
-
- // Handle bigger blit operations (16+ pixels)
- ////////////////////////////////////////////////////////////////////////////////
- .p2align 4
-.LBigBlit:
- // Align destination?
- testl $0xF, %edx
- lddqu (%rax), %xmm1 // Pre-load four source pixels
- jz .LAligned
-
- movq %rdx, %rdi // Calculate alignment of destination pointer
- negq %rdi
- andl $0xF, %edi
-
- // Handle 1-3 pixels to align destination
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- jz .LAlignDone // If all alphas are zero, just skip
- lddqu (%rdx), %xmm5 // Load four destination pixels
- jc .LAlignStore // If all alphas are opaque, just store
-
- // Handle mixed alphas (calculate and scale)
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
-
- psllw $8, %xmm3 // Filter out red and blue components
- pmulhuw %xmm4, %xmm3 // Scale red and blue
- movdqa %xmm5, %xmm2
- psrlw $8, %xmm2 // Filter out alpha and green components
- pmullw %xmm4, %xmm2 // Scale alpha and green
-
- pblendvb %xmm3, %xmm2 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm2, %xmm1 // Add source and destination pixels together
-
-.LAlignStore:
- cmpl $8, %edi // Check how many pixels should be written
- jb .LAlignPixelsLeft1
- ja .LAlignPixelsLeft3
- pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels
- jmp .LAlignStorePixels
-
-.LAlignPixelsLeft1:
- pblendw $0x03, %xmm1, %xmm5 // Blend one pixel
- jmp .LAlignStorePixels
-
-.LAlignPixelsLeft3:
- pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels
-
-.LAlignStorePixels:
- movdqu %xmm5, (%rdx) // Store destination pixels
-
-.LAlignDone:
- addq %rdi, %rax // Adjust pointers and pixel count
- addq %rdi, %rdx
- shrq $2, %rdi
- lddqu (%rax), %xmm1 // Pre-load new source pixels (after alignment)
- subl %edi, %ecx
-
-.LAligned: // Destination is guaranteed to be 16 byte aligned
- xorq %rdi, %rdi // Reset offset to zero
- subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup)
- testl $0xF, %eax // Check alignment of source pointer
- jz .LAlignedLoop
-
- // Source not aligned to destination
- ////////////////////////////////////////////////////////////////////////////////
- .p2align 4
-.LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero00
- lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels
- jz .LAlphaZero00
- movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
-
-.LAlphaZero00:
- ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero01
- lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
- jz .LAlphaZero01
- movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels
-
-.LAlphaZero01:
- addq $32, %rdi // Adjust offset and pixel count
- subl $8, %ecx
- jae .LUnalignedLoop
- addl $8, %ecx // Adjust pixel count
- jmp .LLoopCleanup0
-
- .p2align 4
-.LAlphaNotOpaqueOrZero00:
- movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
-
- // Handle next four pixels
- ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero01
- lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
- jz .LAlphaZero02
- movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels
-.LAlphaZero02:
- addq $32, %rdi // Adjust offset and pixel count
- subl $8, %ecx
- jae .LUnalignedLoop
- addl $8, %ecx // Adjust pixel count
- jmp .LLoopCleanup0
-
- .p2align 4
-.LAlphaNotOpaqueOrZero01:
- movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm2, xmm1) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
- addq $32, %rdi
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm2 // Add source and destination pixels together
- subl $8, %ecx
- movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels
- jae .LUnalignedLoop
- addl $8, %ecx // Adjust pixel count
-
- // Cleanup - handle pending pixels from loop
-.LLoopCleanup0:
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero02
- jz .LAlphaZero03
- movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
-.LAlphaZero03:
- addq $16, %rdi
- subl $4, %ecx
- js .LSmallRemaining // Reuse code from small loop
-
-.LRemain0:
- lddqu (%rax, %rdi), %xmm1 // Load four source pixels
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero02
- jz .LAlphaZero04
- movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
-.LAlphaZero04:
- addq $16, %rdi
- subl $4, %ecx
- jmp .LSmallRemaining // Reuse code from small loop
-
-.LAlphaNotOpaqueOrZero02:
- movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- addq $16, %rdi
- subl $4, %ecx
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqa %xmm1, -16(%rdx, %rdi) // Store four destination pixels
- js .LSmallRemaining // Reuse code from small loop
- jmp .LRemain0
-
- // Source aligned to destination
- ////////////////////////////////////////////////////////////////////////////////
- .p2align 4
-.LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero10
- movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels
- jz .LAlphaZero10
- movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
-
-.LAlphaZero10:
- ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero11
- movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
- jz .LAlphaZero11
- movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels
-
-.LAlphaZero11:
- addq $32, %rdi // Adjust offset and pixel count
- subl $8, %ecx
- jae .LAlignedLoop
- addl $8, %ecx // Adjust pixel count
- jmp .LLoopCleanup1
-
- .p2align 4
-.LAlphaNotOpaqueOrZero10:
- movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
-
- // Handle next four pixels
- ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero11
- movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
- jz .LAlphaZero12
- movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels
-.LAlphaZero12:
- addq $32, %rdi // Adjust offset and pixel count
- subl $8, %ecx
- jae .LAlignedLoop
- addl $8, %ecx // Adjust pixel count
- jmp .LLoopCleanup1
-
- .p2align 4
-.LAlphaNotOpaqueOrZero11:
- movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm2, xmm1) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
- movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
-
- addq $32, %rdi
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm2 // Add source and destination pixels together
- subl $8, %ecx
- movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels
- jae .LAlignedLoop
- addl $8, %ecx // Adjust pixel count
-
- // Cleanup - handle four pending pixels from loop
-.LLoopCleanup1:
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero12
- jz .LAlphaZero13
- movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
-.LAlphaZero13:
- addq $16, %rdi
- subl $4, %ecx
- js .LSmallRemaining // Reuse code from small loop
-
-.LRemain1:
- movdqa (%rax, %rdi), %xmm1 // Pre-load four source pixels
- ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
- ja .LAlphaNotOpaqueOrZero12
- jz .LAlphaZero14
- movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
-.LAlphaZero14:
- addq $16, %rdi
- subl $4, %ecx
- jmp .LSmallRemaining // Reuse code from small loop
-
-.LAlphaNotOpaqueOrZero12:
- movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
- EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value
- SCALE_PIXELS // Scale pixels using alpha
-
- addq $16, %rdi
- subl $4, %ecx
- pblendvb %xmm5, %xmm3 // Combine results (mask in %xmm0, implicitly)
- paddb %xmm3, %xmm1 // Add source and destination pixels together
- movdqa %xmm1, -16(%rdx, %rdi) // Store four destination pixels
- js .LSmallRemaining // Reuse code from small loop
- jmp .LRemain1
-
- .cfi_endproc
-#ifndef __clang__
- .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
-#endif
-
- // Constants for SSE code
-#ifndef __clang__
- .section .rodata
-#endif
- .p2align 4
-.LAlphaCheckMask:
- .long 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000
-.LInverseAlphaCalc:
- .word 256, 256, 256, 256, 256, 256, 256, 256
-.LResultMergeMask:
- .long 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF
-#endif
-
-#endif // CRBUG_399842_FIXED
diff --git a/src/opts/SkColor_opts_SSE2.h b/src/opts/SkColor_opts_SSE2.h
index 970abb859b..feb1d98f8d 100644
--- a/src/opts/SkColor_opts_SSE2.h
+++ b/src/opts/SkColor_opts_SSE2.h
@@ -206,7 +206,14 @@ static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1,
return d_pixel;
}
-// Portable version SkBlendARGB32 is in SkColorPriv.h.
+// Portable version is SkPMSrcOver in SkColorPriv.h.
+static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
+ return _mm_add_epi32(src,
+ SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
+ SkGetPackedA32_SSE2(src))));
+}
+
+// Portable version is SkBlendARGB32 in SkColorPriv.h.
static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
const __m128i& aa) {
__m128i src_scale = SkAlpha255To256_SSE2(aa);
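The new SkPMSrcOver_SSE2 vectorizes the usual premultiplied src-over formula,
dst' = src + dst * (256 - srcA) >> 8 per channel. A scalar sketch of the same
math (the helper name is made up; the steps mirror the portable SkPMSrcOver
and SkAlphaMulQ in SkColorPriv.h):

    #include <cstdint>

    // Scalar sketch of premultiplied src-over; mirrors the vector code above.
    static inline uint32_t pmsrcover(uint32_t src, uint32_t dst) {
        uint32_t invA = 256 - (src >> 24);                // alpha in the top byte
        uint32_t rb = ((dst & 0x00FF00FF) * invA) >> 8;   // scale red and blue
        uint32_t ag = ((dst >> 8) & 0x00FF00FF) * invA;   // scale alpha and green
        return src + ((rb & 0x00FF00FF) | (ag & 0xFF00FF00));
    }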
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 71107d8756..84a4913021 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -227,21 +227,17 @@ static SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
};
-#if defined(SK_ATT_ASM_SUPPORTED)
static SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
NULL, // S32_Opaque,
S32_Blend_BlitRow32_SSE2, // S32_Blend,
- S32A_Opaque_BlitRow32_SSE4_asm, // S32A_Opaque
+ S32A_Opaque_BlitRow32_SSE4, // S32A_Opaque
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
};
-#endif
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
-#if defined(SK_ATT_ASM_SUPPORTED)
if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
return platform_32_procs_SSE4[flags];
} else
-#endif
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
return platform_32_procs_SSE2[flags];
} else {
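With the SK_ATT_ASM_SUPPORTED guards gone, the SSE4 proc is selected purely at
runtime via supports_simd(). A sketch of the caller side (assuming
SkBlitRow::Factory32 and the kSrcPixelAlpha_Flag32 flag as declared in
SkBlitRow.h of this era; illustrative, not part of the CL):

    #include "SkBlitRow.h"

    // Illustrative: fetch the best S32A_Opaque blitter for this CPU, then
    // blit one row of n premultiplied pixels.
    static void blit_row_srcover(SkPMColor* dst, const SkPMColor* src, int n) {
        SkBlitRow::Proc32 proc =
                SkBlitRow::Factory32(SkBlitRow::kSrcPixelAlpha_Flag32);
        proc(dst, src, n, 255);   // the opaque variants assert alpha == 255
    }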