aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar borenet@google.com <borenet@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2012-07-26 14:20:13 +0000
committerGravatar borenet@google.com <borenet@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2012-07-26 14:20:13 +0000
commit84d67151538e2c465f552121896d211b136aa150 (patch)
treed2b0a46e7cf4143ceed5beeac6786df179752b70
parenteb9568a13d247e77847dbc1acbdac7ec59cc70ee (diff)
Implementing Color32 functions for Neon platforms.
Besides the raw processing improvement provided by Neon, the code uses memory preteches (pld) which seem to improve performance greatly when dealing with very large counts. This was tested using bench where color32 accounts for the majority of the workload: bench -match rects_1 -config 8888 -repeat 500 -forceBlend 1 (the forceBlend is there so that the Color32 code does not go through the special cases where alpha == 0xFF as it would transform color32 into a sk_memset32. Numbers averaged over 3 runs: bench name | Before | Neon, no pld | Neon with pld | full boost rrects_1 | 153.9 | 128.3 | 92 | 1.66x rects_1_stroke_4| 32.8 | 31.4 | 28.45 | 1.15x rects_1 | 125.35 | 97.2 | 63.59 | 1.97x Credits: various googletv team members. Committed on behalf of evannier. Review URL: http://codereview.appspot.com/5569077/ git-svn-id: http://skia.googlecode.com/svn/trunk@4779 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r--src/opts/SkBlitRow_opts_arm.cpp113
-rw-r--r--src/opts/SkCachePreload_arm.h34
2 files changed, 143 insertions, 4 deletions
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 80d50abaa0..7f296ef53b 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -1,15 +1,18 @@
/*
- * Copyright 2009 The Android Open Source Project
+ * Copyright 2012 The Android Open Source Project
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
-#include "SkBlitRow.h"
#include "SkBlitMask.h"
+#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
+#include "SkUtils.h"
+
+#include "SkCachePreload_arm.h"
#if defined(__ARM_HAVE_NEON)
#include <arm_neon.h>
@@ -1256,6 +1259,105 @@ static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
#define S32_D565_Opaque_Dither_PROC NULL
#endif
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+static void Color32_neon(SkPMColor* dst, const SkPMColor* src, int count,
+ SkPMColor color) {
+ if (count <= 0) {
+ return;
+ }
+
+ if (0 == color) {
+ if (src != dst) {
+ memcpy(dst, src, count * sizeof(SkPMColor));
+ }
+ return;
+ }
+
+ unsigned colorA = SkGetPackedA32(color);
+ if (255 == colorA) {
+ sk_memset32(dst, color, count);
+ } else {
+ unsigned scale = 256 - SkAlpha255To256(colorA);
+
+ if (count >= 8) {
+ // at the end of this assembly, count will have been decremented
+ // to a negative value. That is, if count mod 8 = x, it will be
+ // -8 +x coming out.
+ asm volatile (
+ PLD128(src, 0)
+
+ "vdup.32 q0, %[color] \n\t"
+
+ PLD128(src, 128)
+
+ // scale numerical interval [0-255], so load as 8 bits
+ "vdup.8 d2, %[scale] \n\t"
+
+ PLD128(src, 256)
+
+ "subs %[count], %[count], #8 \n\t"
+
+ PLD128(src, 384)
+
+ "Loop_Color32: \n\t"
+
+ // load src color, 8 pixels, 4 64 bit registers
+ // (and increment src).
+ "vld1.32 {d4-d7}, [%[src]]! \n\t"
+
+ PLD128(src, 384)
+
+ // multiply long by scale, 64 bits at a time,
+ // destination into a 128 bit register.
+ "vmull.u8 q4, d4, d2 \n\t"
+ "vmull.u8 q5, d5, d2 \n\t"
+ "vmull.u8 q6, d6, d2 \n\t"
+ "vmull.u8 q7, d7, d2 \n\t"
+
+ // shift the 128 bit registers, containing the 16
+ // bit scaled values back to 8 bits, narrowing the
+ // results to 64 bit registers.
+ "vshrn.i16 d8, q4, #8 \n\t"
+ "vshrn.i16 d9, q5, #8 \n\t"
+ "vshrn.i16 d10, q6, #8 \n\t"
+ "vshrn.i16 d11, q7, #8 \n\t"
+
+ // adding back the color, using 128 bit registers.
+ "vadd.i8 q6, q4, q0 \n\t"
+ "vadd.i8 q7, q5, q0 \n\t"
+
+ // store back the 8 calculated pixels (2 128 bit
+ // registers), and increment dst.
+ "vst1.32 {d12-d15}, [%[dst]]! \n\t"
+
+ "subs %[count], %[count], #8 \n\t"
+ "bge Loop_Color32 \n\t"
+ : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
+ : [color] "r" (color), [scale] "r" (scale)
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
+ );
+ // At this point, if we went through the inline assembly, count is
+ // a negative value:
+ // if the value is -8, there is no pixel left to process.
+ // if the value is -7, there is one pixel left to process
+ // ...
+ // And'ing it with 7 will give us the number of pixels
+ // left to process.
+ count = count & 0x7;
+ }
+
+ while (count > 0) {
+ *dst = color + SkAlphaMulQ(*src, scale);
+ src += 1;
+ dst += 1;
+ count--;
+ }
+ }
+}
+#endif
+
///////////////////////////////////////////////////////////////////////////////
static const SkBlitRow::Proc platform_565_procs[] = {
@@ -1305,12 +1407,15 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
return platform_32_procs[flags];
}
+///////////////////////////////////////////////////////////////////////////////
SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+ return Color32_neon;
+#else
return NULL;
+#endif
}
-///////////////////////////////////////////////////////////////////////////////
-
SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
SkMask::Format maskFormat,
SkColor color) {
diff --git a/src/opts/SkCachePreload_arm.h b/src/opts/SkCachePreload_arm.h
new file mode 100644
index 0000000000..cff8c2a9b7
--- /dev/null
+++ b/src/opts/SkCachePreload_arm.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2012 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#ifndef SkCachePreload_arm_DEFINED
+#define SkCachePreload_arm_DEFINED
+
+// This file defines macros for preload instructions for ARM. These macros
+// are designed to be embedded inside GNU inline assembly.
+// For the use of these macros, __ARM_USE_PLD needs to be enabled. The cache
+// line size also needs to be known (and needs to be contained inside
+// __ARM_CACHE_LINE_SIZE).
+#if defined(__ARM_USE_PLD)
+
+#define PLD(x, n) "pld [%["#x"], #("#n")]\n\t"
+
+#if __ARM_CACHE_LINE_SIZE == 32
+ #define PLD64(x, n) PLD(x, n) PLD(x, (n) + 32)
+#elif __ARM_CACHE_LINE_SIZE == 64
+ #define PLD64(x, n) PLD(x, n)
+#else
+ #error "unknown __ARM_CACHE_LINE_SIZE."
+#endif
+#else
+ // PLD is disabled, all macros become empty.
+ #define PLD(x, n)
+ #define PLD64(x, n)
+#endif
+
+#define PLD128(x, n) PLD64(x, n) PLD64(x, (n) + 64)
+
+#endif // SkCachePreload_arm_DEFINED