aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorGravatar agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2010-06-16 20:04:13 +0000
committerGravatar agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2010-06-16 20:04:13 +0000
commit46e2ec51010866c425712aa40edbc2897e889594 (patch)
treefa81535921f0f445eaccc0eddfd5e95c741779e9 /src
parent244929c1fc4f40740356731b7573506872ca7b90 (diff)
We extract a sub-function from MAKENAME(_nofilter_DX) that only deals with reading
one index array, indexing into the src array, and writing the output to the dst array. Because of the scatter-gather nature, we cannot do much burst/batch reading/writing to improve the performance. We tried Neon vector instructions. We also tried to hand-optimize the compiler-generated assembly (non-Neon) code. The latter seems to give the better gain: about a 6% improvement, not much though... Patch-by: Xin Qi of codeaurora.org http://codereview.appspot.com/1127042/show git-svn-id: http://skia.googlecode.com/svn/trunk@579 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r--src/core/SkBitmapProcState.cpp7
-rw-r--r--src/core/SkBitmapProcState_sample.h12
-rw-r--r--src/opts/S32_Opaque_D32_nofilter_DX_gather.S107
3 files changed, 126 insertions, 0 deletions
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index e54818d03f..2bbd777c91 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -86,7 +86,14 @@ static inline U8CPU Filter_8(unsigned x, unsigned y,
SkASSERT(state.fAlphaScale == 256)
#define RETURNDST(src) src
#define SRC_TO_FILTER(src) src
+
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && !defined(SkBitmapProcState_TEMPLATE_USE_NEON)
+#define SkBitmapProcState_TEMPLATE_USE_NEON
+#endif
#include "SkBitmapProcState_sample.h"
+#if defined(SkBitmapProcState_TEMPLATE_USE_NEON)
+#undef SkBitmapProcState_TEMPLATE_USE_NEON
+#endif
#undef FILTER_PROC
#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale)
diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h
index 978d144b94..d3a114a670 100644
--- a/src/core/SkBitmapProcState_sample.h
+++ b/src/core/SkBitmapProcState_sample.h
@@ -58,6 +58,14 @@ void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s,
#endif
}
+#ifdef SkBitmapProcState_TEMPLATE_USE_NEON // set by SkBitmapProcState.cpp only around the include that should use the assembly gather
+extern "C" void S32_Opaque_D32_nofilter_DX_gather_neon( // implemented in src/opts/S32_Opaque_D32_nofilter_DX_gather.S
+ SkPMColor* SK_RESTRICT colors, // destination color buffer; receives count gathered pixels
+ const SkPMColor* SK_RESTRICT srcAddr, // source color buffer that the indices select from
+ int count, // number of pixels to gather
+ const uint32_t* SK_RESTRICT xy); // index array; the assembly reads it as packed 16-bit indices, two per 32-bit word
+#endif
+
void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count, DSTTYPE* SK_RESTRICT colors) {
@@ -85,6 +93,9 @@ void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
DSTTYPE dstValue = RETURNDST(src);
BITMAPPROC_MEMSET(colors, dstValue, count);
} else {
+#ifdef SkBitmapProcState_TEMPLATE_USE_NEON
+ S32_Opaque_D32_nofilter_DX_gather_neon(colors, srcAddr, count, xy);
+#else
int i;
for (i = (count >> 2); i > 0; --i) {
uint32_t xx0 = *xy++;
@@ -104,6 +115,7 @@ void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
SkASSERT(*xx < (unsigned)s.fBitmap->width());
src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
}
+#endif // !SkBitmapProcState_TEMPLATE_USE_NEON
}
#ifdef POSTAMBLE
diff --git a/src/opts/S32_Opaque_D32_nofilter_DX_gather.S b/src/opts/S32_Opaque_D32_nofilter_DX_gather.S
new file mode 100644
index 0000000000..d3ec3442e9
--- /dev/null
+++ b/src/opts/S32_Opaque_D32_nofilter_DX_gather.S
@@ -0,0 +1,107 @@
+/***************************************************************************
+ Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License. You may
+ obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied. See the License for the specific language governing
+ permissions and limitations under the License.
+ ***************************************************************************/
+
+ .text
+ .fpu neon
+ .global S32_Opaque_D32_nofilter_DX_gather_neon
+ .func S32_Opaque_D32_nofilter_DX_gather_neon
+S32_Opaque_D32_nofilter_DX_gather_neon:
+// void S32_Opaque_D32_nofilter_DX_gather_neon(
+// SkPMColor* SK_RESTRICT colors,
+// const SkPMColor* SK_RESTRICT srcAddr,
+// int count,
+// const uint32_t* SK_RESTRICT xy);
+// r0: destination color buffer
+// r1: src color buffer
+// r2: count
+// r3: index array
+// Each 32-bit word of the index array packs two 16-bit indices; the low halfword is used first.
+// This function reads indices from the index array xy, then reads srcAddr
+// at each index and writes the value read into the dest color buffer. In
+// other words, it "gathers" the values from srcAddr, indexed by xy, into
+// the dest array colors, eight pixels per unrolled iteration plus a tail loop.
+//
+// We tried a version using Neon instructions. We also tried to hand
+// optimize the compiler generated assembly (non-neon) code. The latter seems
+// to have better gain (about 6%), so the body below uses only integer ARM instructions.
+
+ PUSH {r0-r11,lr} // save 13 words; r0-r3 are saved too so dst can be reloaded from the stack
+ ASR r0,r2,#3 // r0 = count >> 3 = number of 8-pixel groups
+ SUB sp,sp,#4 //23 reserve one scratch word; the saved r0 (dst) is now at [sp,#4]
+ CMP r0,#0
+ STR r0,[sp,#0] //r0 = count >> 3
+ BLE .L1_140 // no full group of 8: go straight to the tail loop
+ LDR r4,[sp,#4] //r4 = r0 (dst)
+ MOV r0,r3 // NOTE(review): r0 is overwritten by the LDM below before being read; looks dead
+ ADD r12,r3,#4 // NOTE(review): r12 is overwritten after the loop before being read; looks dead
+ ASR r8,r2,#3 // r8 = loop counter = count >> 3
+.L1_52:
+ LDM r3!, {r0,r6,r9,r11} // load four index words (eight 16-bit indices) and advance xy by 16 bytes
+ LSR r5,r0,#16 //30 r5 = high-halfword index of word 0
+ LDR r5,[r1,r5,LSL #2] //30 r5 = srcAddr[r5]
+ LSR r7,r6,#16 //32 r7 = high-halfword index of word 1
+ LDR r7,[r1,r7,LSL #2] //31 r7 = srcAddr[r7]
+ UXTH r0,r0 //34 r0 = low-halfword index of word 0
+ LDR r0,[r1,r0,LSL #2] //34 r0 = srcAddr[r0]
+ UXTH r6,r6 //31 r6 = low-halfword index of word 1
+ LDR r6,[r1,r6,LSL #2] //32 r6 = srcAddr[r6]
+ //STM r4!, {r0,r5,r6,r7} ;35
+ LSR r10,r9,#16 //30 r10 = high-halfword index of word 2
+ LDR r10,[r1,r10,LSL #2] //30 r10 = srcAddr[r10]
+ LSR lr,r11,#16 //32 lr = high-halfword index of word 3
+ LDR lr,[r1,lr,LSL #2] //31 lr = srcAddr[lr]
+ UXTH r9,r9 //34 r9 = low-halfword index of word 2
+ LDR r9,[r1,r9,LSL #2] //34 r9 = srcAddr[r9]
+ UXTH r11,r11 //31 r11 = low-halfword index of word 3
+ LDR r11,[r1,r11,LSL #2] //32 r11 = srcAddr[r11]
+ SUBS r8,r8,#1 // decrement group counter; the flags set here are consumed by the BNE after the STM
+ STM r4!, {r0,r5,r6,r7,r9,r10,r11,lr} //35 store 8 pixels; STM writes in ascending register order, giving low/high index order per word
+
+ BNE .L1_52 // repeat while groups remain
+
+ LDR r0,[sp,#0] // count >> 3
+ MOV r12,r0
+ LDR r0,[sp,#4] //r0 = dst
+ ADD r0,r0,r12,LSL #5 //dst += count >>3 << 5
+ STR r0,[sp,#4] //save r0 into stack again
+.L1_140:
+//;;39 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy);
+//;;40 for (i = (count & 7); i > 0; --i) {
+ TST r2,#7 // any leftover pixels (count & 7)?
+ BEQ .L1_184 // none: skip the tail loop
+ LDR r0,[sp,#4] //r0 = current dst
+ AND r2,r2,#7 // r2 = number of leftover pixels
+.L1_156:
+//;;41 //SkASSERT(*xx < (unsigned)s.fBitmap->width());
+//;;42 src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
+ LDRH r4,[r3],#2 // r4 = next 16-bit index; advance the index pointer by 2 bytes
+ ADD r12,r0,#4 // r12 = address of the next dst pixel
+//;;43 }
+ SUBS r2,r2,#1 // decrement tail counter; the flags set here are consumed by the BNE below
+ LDR r4,[r1,r4,LSL #2] //42
+ STR r4,[r0,#0] //42
+ MOV r0,r12 //42
+ BNE .L1_156 // repeat while leftover pixels remain
+.L1_184:
+//;;44 }
+ ADD sp,sp,#0x14 // drop the scratch word (4 bytes) plus the saved r0-r3 (16 bytes)
+ POP {r4-r11,pc} // restore r4-r11 and return by popping the saved lr into pc
+
+.endfunc
+.size S32_Opaque_D32_nofilter_DX_gather_neon, .-S32_Opaque_D32_nofilter_DX_gather_neon
+
+// ENDFUNC
+// END