From 8dd90a926a8660da2bacc7af149f4ac5b2e7c64c Mon Sep 17 00:00:00 2001
From: "tomhudson@google.com"
 <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>
Date: Mon, 19 Mar 2012 13:49:50 +0000
Subject: (SSE2) acceleration for rectangular opaque erases. 15% speedup for
 rectangles < 31 px wide, 5% for larger.

http://codereview.appspot.com/5843050/


git-svn-id: http://skia.googlecode.com/svn/trunk@3423 2bbb7eff-a529-9590-31e7-b0007b416f81
---
 src/opts/SkBlitRect_opts_SSE2.cpp | 133 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 src/opts/SkBlitRect_opts_SSE2.cpp

(limited to 'src/opts/SkBlitRect_opts_SSE2.cpp')

diff --git a/src/opts/SkBlitRect_opts_SSE2.cpp b/src/opts/SkBlitRect_opts_SSE2.cpp
new file mode 100644
index 0000000000..9336951f18
--- /dev/null
+++ b/src/opts/SkBlitRect_opts_SSE2.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBlitRect_opts_SSE2.h"
+#include "SkBlitRow.h"
+#include "SkColorPriv.h"
+
+#include <emmintrin.h>
+
+/** Simple blitting of opaque rectangles less than 31 pixels wide:
+    inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
+    Uses only scalar 32-bit stores, four per pass of the unrolled loop.
+    @param destination  first pixel of the top row
+    @param width        pixels per row; asserted to be in 1..30
+    @param height       number of rows; nothing is written when <= 0
+    @param rowBytes     byte distance between the starts of adjacent rows
+    @param color        value stored in every pixel; alpha asserted to be 255
+*/
+void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
+                                  int width, int height,
+                                  size_t rowBytes, uint32_t color) {
+    SkASSERT(255 == SkGetPackedA32(color));
+    SkASSERT(width > 0);
+    SkASSERT(width < 31);
+
+    for (; height > 0; --height) {
+        SkPMColor* dst = destination;
+        int count = width;
+        // >= (not >) so a final group of exactly four stays in this loop.
+        for (; count >= 4; count -= 4, dst += 4) {
+            dst[0] = dst[1] = dst[2] = dst[3] = color;
+        }
+        while (count-- > 0) {
+            *dst++ = color;
+        }
+        destination = reinterpret_cast<SkPMColor*>(
+                reinterpret_cast<char*>(destination) + rowBytes);
+    }
+}
+
+/**
+  Fast blitting of opaque rectangles at least 31 pixels wide:
+  inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
+  A 31 pixel rectangle is guaranteed to have at least one
+  16-pixel aligned span that can take advantage of mm_store.
+  Each row: scalar stores to a 16-byte boundary, 128-bit stores, scalar tail.
+  @param destination  first pixel of the top row
+  @param width        pixels per row; asserted to be at least 31
+  @param height       number of rows; nothing is written when <= 0
+  @param rowBytes     byte distance between the starts of adjacent rows
+  @param color        value stored in every pixel; alpha asserted to be 255
+*/
+void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
+                                int width, int height,
+                                size_t rowBytes, uint32_t color) {
+    SkASSERT(255 == SkGetPackedA32(color));
+    SkASSERT(width >= 31);
+
+    const __m128i color_wide = _mm_set1_epi32(color);
+    for (; height > 0; --height) {
+        // Prefetching one row ahead to L1 cache can equal hardware performance
+        // for large/tall rects, but never *beats* hardware performance.
+        SkPMColor* dst = destination;
+        int count = width;
+        // Bounded by count: with the asserts compiled out, a tiny width or
+        // a dst that never reaches 16-byte alignment must not overrun the row.
+        while (count > 0 && (reinterpret_cast<size_t>(dst) & 0x0F)) {
+            *dst++ = color;
+            --count;
+        }
+        __m128i* d = reinterpret_cast<__m128i*>(dst);
+        // Googling suggests _mm_stream is only going to beat _mm_store for
+        // things that wouldn't fit in L2 cache anyway, typically >500kB, and
+        // precisely fill cache lines.  For us, with arrays > 100k elements
+        // _mm_stream is still 100%+ slower than mm_store.
+
+        // Unrolling to count >= 64 is a break-even for most
+        // input patterns; we seem to be saturating the bus and having
+        // low enough overhead at 32.
+        while (count >= 32) {
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            count -= 32;
+        }
+        if (count >= 16) {
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            count -= 16;
+        }
+        dst = reinterpret_cast<SkPMColor*>(d);
+        // Unrolling the loop in the Narrow code is a significant performance
+        // gain, but unrolling this loop appears to make no difference in
+        // benchmarks with either mm_store_si128 or individual sets.
+        while (count-- > 0) {
+            *dst++ = color;
+        }
+        destination = reinterpret_cast<SkPMColor*>(
+                reinterpret_cast<char*>(destination) + rowBytes);
+    }
+}
+
+/** Fills a width x height rectangle of 32-bit pixels with color.
+    Writes nothing when width, height or color is zero; every other call is
+    forwarded to SkBlitRow::ColorRect32 while the opaque SSE2 path is disabled.
+*/
+void ColorRect32_SSE2(SkPMColor* destination,
+                      int width, int height,
+                      size_t rowBytes, uint32_t color) {
+    if (0 == height || 0 == width || 0 == color) {
+        return;
+    }
+    // NOTE(review): the opaque fast path below is disabled; the reason is not
+    // visible here.  Its alpha test stays in the comment so no local is unused.
+    //   if (255 == SkGetPackedA32(color)) {   // args: same as the call below
+    //       if (width < 31) { BlitRect32_OpaqueNarrow_SSE2(...); }
+    //       else            { BlitRect32_OpaqueWide_SSE2(...); }
+    //       return;
+    //   }
+    SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
+}
+
-- 
cgit v1.2.3