diff options
author | agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2010-06-16 20:04:13 +0000 |
---|---|---|
committer | agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2010-06-16 20:04:13 +0000 |
commit | 46e2ec51010866c425712aa40edbc2897e889594 (patch) | |
tree | fa81535921f0f445eaccc0eddfd5e95c741779e9 /src | |
parent | 244929c1fc4f40740356731b7573506872ca7b90 (diff) |
We extract a sub-function from MAKENAME(_nofilter_DX), only dealing with reading
one index array, indexing into src array and output to dst array.
Because of the scatter-gather nature, we can not do much burst/batch
reading/writing to improve the performance.
We tried Neon vector instructions. We also tried to hand-optimize the compiler
generated assembly (non-neon) code. The latter seems to have better gain.
About a 6% improvement, not much though...
Patch-by: Xin Qi of codeaurora.org
http://codereview.appspot.com/1127042/show
git-svn-id: http://skia.googlecode.com/svn/trunk@579 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r-- | src/core/SkBitmapProcState.cpp | 7 | ||||
-rw-r--r-- | src/core/SkBitmapProcState_sample.h | 12 | ||||
-rw-r--r-- | src/opts/S32_Opaque_D32_nofilter_DX_gather.S | 107 |
3 files changed, 126 insertions, 0 deletions
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp index e54818d03f..2bbd777c91 100644 --- a/src/core/SkBitmapProcState.cpp +++ b/src/core/SkBitmapProcState.cpp @@ -86,7 +86,14 @@ static inline U8CPU Filter_8(unsigned x, unsigned y, SkASSERT(state.fAlphaScale == 256) #define RETURNDST(src) src #define SRC_TO_FILTER(src) src + +#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && !defined(SkBitmapProcState_TEMPLATE_USE_NEON) +#define SkBitmapProcState_TEMPLATE_USE_NEON +#endif #include "SkBitmapProcState_sample.h" +#if defined(SkBitmapProcState_TEMPLATE_USE_NEON) +#undef SkBitmapProcState_TEMPLATE_USE_NEON +#endif #undef FILTER_PROC #define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale) diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h index 978d144b94..d3a114a670 100644 --- a/src/core/SkBitmapProcState_sample.h +++ b/src/core/SkBitmapProcState_sample.h @@ -58,6 +58,14 @@ void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s, #endif } +#ifdef SkBitmapProcState_TEMPLATE_USE_NEON +extern "C" void S32_Opaque_D32_nofilter_DX_gather_neon( + SkPMColor* SK_RESTRICT colors, + const SkPMColor* SK_RESTRICT srcAddr, + int count, + const uint32_t* SK_RESTRICT xy); +#endif + void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, const uint32_t* SK_RESTRICT xy, int count, DSTTYPE* SK_RESTRICT colors) { @@ -85,6 +93,9 @@ void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, DSTTYPE dstValue = RETURNDST(src); BITMAPPROC_MEMSET(colors, dstValue, count); } else { +#ifdef SkBitmapProcState_TEMPLATE_USE_NEON + S32_Opaque_D32_nofilter_DX_gather_neon(colors, srcAddr, count, xy); +#else int i; for (i = (count >> 2); i > 0; --i) { uint32_t xx0 = *xy++; @@ -104,6 +115,7 @@ void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, SkASSERT(*xx < (unsigned)s.fBitmap->width()); src = srcAddr[*xx++]; *colors++ = RETURNDST(src); } +#endif // !SkBitmapProcState_TEMPLATE_USE_NEON } 
#ifdef POSTAMBLE diff --git a/src/opts/S32_Opaque_D32_nofilter_DX_gather.S b/src/opts/S32_Opaque_D32_nofilter_DX_gather.S new file mode 100644 index 0000000000..d3ec3442e9 --- /dev/null +++ b/src/opts/S32_Opaque_D32_nofilter_DX_gather.S @@ -0,0 +1,107 @@ +/*************************************************************************** + Copyright (c) 2010, Code Aurora Forum. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you + may not use this file except in compliance with the License. You may + obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied. See the License for the specific language governing + permissions and limitations under the License. + ***************************************************************************/ + + .text + .fpu neon + .global S32_Opaque_D32_nofilter_DX_gather_neon + .func S32_Opaque_D32_nofilter_DX_gather_neon +S32_Opaque_D32_nofilter_DX_gather_neon: +// void S32_Opaque_D32_nofilter_DX_gather_neon( +// SkPMColor* SK_RESTRICT colors, +// const SkPMColor* SK_RESTRICT srcAddr, +// int count, +// const uint32_t* SK_RESTRICT xy); +// r0: destination color buffer +// r1: src color buffer +// r2: count (number of colors to write) +// r3: index array (xy); each 32-bit word packs two 16-bit indices +// +// This function will read indexes from index array xy, and then read srcAddr +// with the index, and put the read-out value into dest color buffer. In +// other words, it will "gather" the value from srcAddr, indexed by xy, into +// dest array colors. +// +// We tried a version using Neon instructions. We also tried to hand +// optimize the compiler generated assembly (non-neon) code. 
The latter seems +// to have better gain: we get about a 6% improvement + + PUSH {r0-r11,lr} + ASR r0,r2,#3 + SUB sp,sp,#4 //23 + CMP r0,#0 + STR r0,[sp,#0] //r0 = count >> 3 + BLE .L1_140 + LDR r4,[sp,#4] //r4 = dst (the r0 saved by the PUSH above) + MOV r0,r3 //NOTE(review): looks dead, r0 is overwritten by the LDM below + ADD r12,r3,#4 //NOTE(review): looks dead, r12 is rewritten before it is next read + ASR r8,r2,#3 //r8 = loop counter (count >> 3) +.L1_52: + LDM r3!, {r0,r6,r9,r11} //load 4 index words = 8 packed 16-bit indices + LSR r5,r0,#16 //30 + LDR r5,[r1,r5,LSL #2] //30 + LSR r7,r6,#16 //32 + LDR r7,[r1,r7,LSL #2] //31 + UXTH r0,r0 //34 + LDR r0,[r1,r0,LSL #2] //34 + UXTH r6,r6 //31 + LDR r6,[r1,r6,LSL #2] //32 + //STM r4!, {r0,r5,r6,r7} ;35 + LSR r10,r9,#16 //30 + LDR r10,[r1,r10,LSL #2] //30 + LSR lr,r11,#16 //32 + LDR lr,[r1,lr,LSL #2] //31 + UXTH r9,r9 //34 + LDR r9,[r1,r9,LSL #2] //34 + UXTH r11,r11 //31 + LDR r11,[r1,r11,LSL #2] //32 + SUBS r8,r8,#1 + STM r4!, {r0,r5,r6,r7,r9,r10,r11,lr} //35 + + BNE .L1_52 + + LDR r0,[sp,#0] // count >> 3 + MOV r12,r0 + LDR r0,[sp,#4] //r0 = dst + ADD r0,r0,r12,LSL #5 //dst += (count >> 3) << 5, i.e. 8 colors * 4 bytes per loop pass + STR r0,[sp,#4] //save r0 into stack again +.L1_140: +//;;39 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy); +//;;40 for (i = (count & 7); i > 0; --i) { + TST r2,#7 + BEQ .L1_184 + LDR r0,[sp,#4] //r0 = current dst + AND r2,r2,#7 +.L1_156: +//;;41 //SkASSERT(*xx < (unsigned)s.fBitmap->width()); +//;;42 src = srcAddr[*xx++]; *colors++ = RETURNDST(src); + LDRH r4,[r3],#2 + ADD r12,r0,#4 +//;;43 } + SUBS r2,r2,#1 + LDR r4,[r1,r4,LSL #2] //42 + STR r4,[r0,#0] //42 + MOV r0,r12 //42 + BNE .L1_156 +.L1_184: +//;;44 } + ADD sp,sp,#0x14 //drop the 4-byte scratch slot plus the saved r0-r3 (16 bytes) + POP {r4-r11,pc} + +.endfunc +.size S32_Opaque_D32_nofilter_DX_gather_neon, .-S32_Opaque_D32_nofilter_DX_gather_neon + +// ENDFUNC +// END |