diff options
author | robertphillips@google.com <robertphillips@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2012-09-04 12:48:01 +0000 |
---|---|---|
committer | robertphillips@google.com <robertphillips@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2012-09-04 12:48:01 +0000 |
commit | b78765e63b5de5a7dfe5f9f6813f6df81cae14ae (patch) | |
tree | 292bb289b1309e4c31981aa9a1f18586b24247a8 | |
parent | 4f55d39a175afe70c1231eb7389790633210106f (diff) |
Reverting r5364 (Update ARM and NEON optimizations for S32A_Opaque_BlitRow32)
git-svn-id: http://skia.googlecode.com/svn/trunk@5378 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r-- | bench/BitmapBench.cpp | 205 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm.cpp | 314 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm_neon.cpp | 186 |
3 files changed, 48 insertions, 657 deletions
diff --git a/bench/BitmapBench.cpp b/bench/BitmapBench.cpp index 59ea16dde9..5f06f884a8 100644 --- a/bench/BitmapBench.cpp +++ b/bench/BitmapBench.cpp @@ -21,6 +21,25 @@ static const char* gConfigName[] = { "ERROR", "a1", "a8", "index8", "565", "4444", "8888" }; +static void drawIntoBitmap(const SkBitmap& bm) { + const int w = bm.width(); + const int h = bm.height(); + + SkCanvas canvas(bm); + SkPaint p; + p.setAntiAlias(true); + p.setColor(SK_ColorRED); + canvas.drawCircle(SkIntToScalar(w)/2, SkIntToScalar(h)/2, + SkIntToScalar(SkMin32(w, h))*3/8, p); + + SkRect r; + r.set(0, 0, SkIntToScalar(w), SkIntToScalar(h)); + p.setStyle(SkPaint::kStroke_Style); + p.setStrokeWidth(SkIntToScalar(4)); + p.setColor(SK_ColorBLUE); + canvas.drawRect(r, p); +} + static int conv6ToByte(int x) { return x * 0xFF / 5; } @@ -83,23 +102,38 @@ class BitmapBench : public SkBenchmark { bool fIsOpaque; bool fForceUpdate; //bitmap marked as dirty before each draw. forces bitmap to be updated on device cache int fTileX, fTileY; // -1 means don't use shader - bool fIsVolatile; - SkBitmap::Config fConfig; SkString fName; enum { N = SkBENCHLOOP(300) }; - enum { W = 128 }; - enum { H = 128 }; public: BitmapBench(void* param, bool isOpaque, SkBitmap::Config c, bool forceUpdate = false, bool bitmapVolatile = false, int tx = -1, int ty = -1) - : INHERITED(param) - , fIsOpaque(isOpaque) - , fForceUpdate(forceUpdate) - , fIsVolatile(bitmapVolatile) - , fTileX(tx) - , fTileY(ty) - , fConfig(c) { + : INHERITED(param), fIsOpaque(isOpaque), fForceUpdate(forceUpdate), fTileX(tx), fTileY(ty) { + const int w = 128; + const int h = 128; + SkBitmap bm; + + if (SkBitmap::kIndex8_Config == c) { + bm.setConfig(SkBitmap::kARGB_8888_Config, w, h); + } else { + bm.setConfig(c, w, h); + } + bm.allocPixels(); + bm.eraseColor(isOpaque ? SK_ColorBLACK : 0); + + drawIntoBitmap(bm); + + if (SkBitmap::kIndex8_Config == c) { + convertToIndex666(bm, &fBitmap); + } else { + fBitmap = bm; + } + + if (fBitmap.getColorTable()) { + fBitmap.getColorTable()->setIsOpaque(isOpaque); + } + fBitmap.setIsOpaque(isOpaque); + fBitmap.setIsVolatile(bitmapVolatile); } protected: @@ -111,43 +145,16 @@ protected: fName.appendf("_%s", gTileName[fTileY]); } } - fName.appendf("_%s%s", gConfigName[fConfig], + fName.appendf("_%s%s", gConfigName[fBitmap.config()], fIsOpaque ? "" : "_A"); if (fForceUpdate) fName.append("_update"); - if (fIsVolatile) + if (fBitmap.isVolatile()) fName.append("_volatile"); return fName.c_str(); } - virtual void onPreDraw() { - SkBitmap bm; - - if (SkBitmap::kIndex8_Config == fConfig) { - bm.setConfig(SkBitmap::kARGB_8888_Config, W, H); - } else { - bm.setConfig(fConfig, W, H); - } - - bm.allocPixels(); - bm.eraseColor(fIsOpaque ? SK_ColorBLACK : 0); - - onDrawIntoBitmap(bm); - - if (SkBitmap::kIndex8_Config == fConfig) { - convertToIndex666(bm, &fBitmap); - } else { - fBitmap = bm; - } - - if (fBitmap.getColorTable()) { - fBitmap.getColorTable()->setIsOpaque(fIsOpaque); - } - fBitmap.setIsOpaque(fIsOpaque); - fBitmap.setIsVolatile(fIsVolatile); - } - virtual void onDraw(SkCanvas* canvas) { SkIPoint dim = this->getSize(); SkRandom rand; @@ -170,25 +177,6 @@ protected: } } - virtual void onDrawIntoBitmap(const SkBitmap& bm) { - const int w = bm.width(); - const int h = bm.height(); - - SkCanvas canvas(bm); - SkPaint p; - p.setAntiAlias(true); - p.setColor(SK_ColorRED); - canvas.drawCircle(SkIntToScalar(w)/2, SkIntToScalar(h)/2, - SkIntToScalar(SkMin32(w, h))*3/8, p); - - SkRect r; - r.set(0, 0, SkIntToScalar(w), SkIntToScalar(h)); - p.setStyle(SkPaint::kStroke_Style); - p.setStrokeWidth(SkIntToScalar(4)); - p.setColor(SK_ColorBLUE); - canvas.drawRect(r, p); - } - private: typedef SkBenchmark INHERITED; }; @@ -253,95 +241,6 @@ private: typedef BitmapBench INHERITED; }; -/** Verify optimizations that test source alpha values. */ - -class SourceAlphaBitmapBench : public BitmapBench { -public: - enum SourceAlpha { kOpaque_SourceAlpha, kTransparent_SourceAlpha, - kTwoStripes_SourceAlpha, kThreeStripes_SourceAlpha}; -private: - SkString fFullName; - SourceAlpha fSourceAlpha; -public: - SourceAlphaBitmapBench(void* param, SourceAlpha alpha, SkBitmap::Config c, - bool forceUpdate = false, bool bitmapVolatile = false, - int tx = -1, int ty = -1) - : INHERITED(param, false, c, forceUpdate, bitmapVolatile, tx, ty) - , fSourceAlpha(alpha) { - } - -protected: - virtual const char* onGetName() { - fFullName.set(INHERITED::onGetName()); - - if (fSourceAlpha == kOpaque_SourceAlpha) { - fFullName.append("_source_opaque"); - } else if (fSourceAlpha == kTransparent_SourceAlpha) { - fFullName.append("_source_transparent"); - } else if (fSourceAlpha == kTwoStripes_SourceAlpha) { - fFullName.append("_source_stripes_two"); - } else if (fSourceAlpha == kThreeStripes_SourceAlpha) { - fFullName.append("_source_stripes_three"); - } - - return fFullName.c_str(); - } - - virtual void onDrawIntoBitmap(const SkBitmap& bm) SK_OVERRIDE { - const int w = bm.width(); - const int h = bm.height(); - - if (kOpaque_SourceAlpha == fSourceAlpha) { - bm.eraseColor(SK_ColorBLACK); - } else if (kTransparent_SourceAlpha == fSourceAlpha) { - bm.eraseColor(0); - } else if (kTwoStripes_SourceAlpha == fSourceAlpha) { - bm.eraseColor(0); - - SkCanvas canvas(bm); - SkPaint p; - p.setAntiAlias(false); - p.setStyle(SkPaint::kFill_Style); - p.setColor(SK_ColorRED); - - // Draw red vertical stripes on transparent background - SkRect r; - for (int x = 0; x < w; x+=2) - { - r.set(SkIntToScalar(x), 0, SkIntToScalar(x+1), SkIntToScalar(h)); - canvas.drawRect(r, p); - } - - } else if (kThreeStripes_SourceAlpha == fSourceAlpha) { - bm.eraseColor(0); - - SkCanvas canvas(bm); - SkPaint p; - p.setAntiAlias(false); - p.setStyle(SkPaint::kFill_Style); - - // Draw vertical stripes on transparent background with a pattern - // where the first pixel is fully transparent, the next is semi-transparent - // and the third is fully opaque. - SkRect r; - for (int x = 0; x < w; x++) - { - if (x % 3 == 0) { - continue; // Keep transparent - } else if (x % 3 == 1) { - p.setColor(SkColorSetARGB(127, 127, 127, 127)); // Semi-transparent - } else if (x % 3 == 2) { - p.setColor(SK_ColorRED); // Opaque - } - r.set(SkIntToScalar(x), 0, SkIntToScalar(x+1), SkIntToScalar(h)); - canvas.drawRect(r, p); - } - } - } - -private: - typedef BitmapBench INHERITED; -}; static SkBenchmark* Fact0(void* p) { return new BitmapBench(p, false, SkBitmap::kARGB_8888_Config); } static SkBenchmark* Fact1(void* p) { return new BitmapBench(p, true, SkBitmap::kARGB_8888_Config); } static SkBenchmark* Fact2(void* p) { return new BitmapBench(p, true, SkBitmap::kRGB_565_Config); } @@ -364,12 +263,6 @@ static SkBenchmark* Fact14(void* p) { return new FilterBitmapBench(p, true, SkBi static SkBenchmark* Fact15(void* p) { return new FilterBitmapBench(p, true, SkBitmap::kARGB_8888_Config, true, true, -1, -1, true, true, true); } static SkBenchmark* Fact16(void* p) { return new FilterBitmapBench(p, true, SkBitmap::kARGB_8888_Config, true, false, -1, -1, true, true, true); } -// source alpha tests -> S32A_Opaque_BlitRow32_{arm,neon} -static SkBenchmark* Fact17(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kOpaque_SourceAlpha, SkBitmap::kARGB_8888_Config); } -static SkBenchmark* Fact18(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kTransparent_SourceAlpha, SkBitmap::kARGB_8888_Config); } -static SkBenchmark* Fact19(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kTwoStripes_SourceAlpha, SkBitmap::kARGB_8888_Config); } -static SkBenchmark* Fact20(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kThreeStripes_SourceAlpha, SkBitmap::kARGB_8888_Config); } - static BenchRegistry gReg0(Fact0); static BenchRegistry gReg1(Fact1); static BenchRegistry gReg2(Fact2); @@ -390,7 +283,3 @@ static BenchRegistry gReg14(Fact14); static BenchRegistry gReg15(Fact15); static BenchRegistry gReg16(Fact16); -static BenchRegistry gReg17(Fact17); -static BenchRegistry gReg18(Fact18); -static BenchRegistry gReg19(Fact19); -static BenchRegistry gReg20(Fact20); diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index 36bed97ccb..f6e6ba2966 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -185,306 +185,6 @@ static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory" ); } - -static void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_src_alpha - (SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - -/* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */ -/* Predicts that the next pixel will have the same alpha type as the current pixel */ - -asm volatile ( - - "\tSTMDB r13!, {r4-r12, r14} \n" /* saving r4-r12, lr on the stack */ - /* we should not save r0-r3 according to ABI */ - - "\tCMP r2, #0 \n" /* if (count == 0) */ - "\tBEQ 9f \n" /* go to EXIT */ - - "\tMOV r12, #0xff \n" /* load the 0xff mask in r12 */ - "\tORR r12, r12, r12, LSL #16 \n" /* convert it to 0xff00ff in r12 */ - - "\tMOV r14, #255 \n" /* r14 = 255 */ - /* will be used later for left-side comparison */ - - "\tADD r2, %[src], r2, LSL #2 \n" /* r2 points to last array element which can be used */ - "\tSUB r2, r2, #16 \n" /* as a base for 4-way processing algorithm */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer is bigger than */ - "\tBGT 8f \n" /* calculated marker for 4-way -> */ - /* use simple one-by-one processing */ - - /* START OF DISPATCHING BLOCK */ - - "\t0: \n" - - "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ - - "\tLSR r7, r3, #24 \n" /* if not all src alphas of 4-way block are equal -> */ - "\tCMP r7, r4, LSR #24 \n" - "\tCMPEQ r7, r5, LSR #24 \n" - "\tCMPEQ r7, r6, LSR #24 \n" - "\tBNE 1f \n" /* -> go to general 4-way processing routine */ - - "\tCMP r14, r7 \n" /* if all src alphas are equal to 255 */ - "\tBEQ 3f \n" /* go to alpha == 255 optimized routine */ - - "\tCMP r7, #0 \n" /* if all src alphas are equal to 0 */ - "\tBEQ 6f \n" /* go to alpha == 0 optimized routine */ - - /* END OF DISPATCHING BLOCK */ - - /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */ - - "\t1: \n" - /* we do not have enough registers to make */ - /* 4-way [dst] loading -> we are using 2 * 2-way */ - - "\tLDM %[dst], {r7, r8} \n" /* 1st 2-way loading of dst values to r7-r8 */ - - /* PROCESSING BLOCK 1 */ - /* r3 = src, r7 = dst */ - - "\tLSR r11, r3, #24 \n" /* extracting alpha from source and storing to r11 */ - "\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */ - "\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */ - "\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */ - "\tMUL r9, r9, r11 \n" /* br = br * scale */ - "\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */ - "\tMUL r10, r10, r11 \n" /* ag = ag * scale */ - "\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */ - "\tORR r7, r9, r10 \n" /* br | ag */ - "\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */ - - /* PROCESSING BLOCK 2 */ - /* r4 = src, r8 = dst */ - - "\tLSR r11, r4, #24 \n" /* see PROCESSING BLOCK 1 */ - "\tAND r9, r12, r8 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r10, r12, r8, LSR #8 \n" - "\tMUL r9, r9, r11 \n" - "\tAND r9, r12, r9, LSR #8 \n" - "\tMUL r10, r10, r11 \n" - "\tAND r10, r10, r12, LSL #8 \n" - "\tORR r8, r9, r10 \n" - "\tADD r8, r4, r8 \n" - - "\tSTM %[dst]!, {r7, r8} \n" /* 1st 2-way storing of processed dst values */ - - "\tLDM %[dst], {r9, r10} \n" /* 2nd 2-way loading of dst values to r9-r10 */ - - /* PROCESSING BLOCK 3 */ - /* r5 = src, r9 = dst */ - - "\tLSR r11, r5, #24 \n" /* see PROCESSING BLOCK 1 */ - "\tAND r7, r12, r9 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r8, r12, r9, LSR #8 \n" - "\tMUL r7, r7, r11 \n" - "\tAND r7, r12, r7, LSR #8 \n" - "\tMUL r8, r8, r11 \n" - "\tAND r8, r8, r12, LSL #8 \n" - "\tORR r9, r7, r8 \n" - "\tADD r9, r5, r9 \n" - - /* PROCESSING BLOCK 4 */ - /* r6 = src, r10 = dst */ - - "\tLSR r11, r6, #24 \n" /* see PROCESSING BLOCK 1 */ - "\tAND r7, r12, r10 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r8, r12, r10, LSR #8 \n" - "\tMUL r7, r7, r11 \n" - "\tAND r7, r12, r7, LSR #8 \n" - "\tMUL r8, r8, r11 \n" - "\tAND r8, r8, r12, LSL #8 \n" - "\tORR r10, r7, r8 \n" - "\tADD r10, r6, r10 \n" - - "\tSTM %[dst]!, {r9, r10} \n" /* 2nd 2-way storing of processed dst values */ - - "\tCMP %[src], r2 \n" /* if our current [src] pointer <= calculated marker */ - "\tBLE 0b \n" /* we could run 4-way processing -> go to dispatcher */ - "\tBGT 8f \n" /* else -> use simple one-by-one processing */ - - /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */ - - /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */ - - "\t2: \n" /* ENTRY 1: LOADING [src] to registers */ - - "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ - - "\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */ - "\tAND r8, r5, r6 \n" - "\tAND r9, r7, r8 \n" - "\tCMP r14, r9, LSR #24 \n" - "\tBNE 4f \n" /* -> go to alpha == 0 check */ - - "\t3: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */ - - "\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ - "\tBLE 2b \n" /* we could run 4-way processing */ - /* because now we're in ALPHA == 255 state */ - /* run next cycle with priority alpha == 255 checks */ - - "\tBGT 8f \n" /* if our current [src] array pointer > marker */ - /* use simple one-by-one processing */ - - "\t4: \n" - - "\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */ - "\tORR r8, r5, r6 \n" - "\tORR r9, r7, r8 \n" - "\tLSRS r9, #24 \n" - "\tBNE 1b \n" /* -> go to general processing mode */ - /* (we already checked for alpha == 255) */ - - "\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ - "\tBLE 5f \n" /* we could run 4-way processing one more time */ - /* because now we're in ALPHA == 0 state */ - /* run next cycle with priority alpha == 0 checks */ - - "\tBGT 8f \n" /* if our current [src] array pointer > marker */ - /* use simple one-by-one processing */ - - /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */ - - /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */ - - "\t5: \n" /* ENTRY 1: LOADING [src] to registers */ - - "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ - - "\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */ - "\tORR r8, r5, r6 \n" - "\tORR r9, r7, r8 \n" - "\tLSRS r9, #24 \n" - "\tBNE 7f \n" /* -> go to alpha == 255 check */ - - "\t6: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */ - - "\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ - "\tBLE 5b \n" /* we could run 4-way processing one more time */ - /* because now we're in ALPHA == 0 state */ - /* run next cycle with priority alpha == 0 checks */ - - "\tBGT 8f \n" /* if our current [src] array pointer > marker */ - /* use simple one-by-one processing */ - "\t7: \n" - - "\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */ - "\tAND r8, r5, r6 \n" - "\tAND r9, r7, r8 \n" - "\tCMP r14, r9, LSR #24 \n" - "\tBNE 1b \n" /* -> go to general processing mode */ - /* (we already checked for alpha == 0) */ - - "\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ - "\tBLE 2b \n" /* we could run 4-way processing one more time */ - /* because now we're in ALPHA == 255 state */ - /* run next cycle with priority alpha == 255 checks */ - - "\tBGT 8f \n" /* if our current [src] array pointer > marker */ - /* use simple one-by-one processing */ - - /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */ - - /* START OF TAIL BLOCK */ - /* (used when array is too small to be processed with 4-way algorithm)*/ - - "\t8: \n" - - "\tADD r2, r2, #16 \n" /* now r2 points to the element just after array */ - /* we've done r2 = r2 - 16 at procedure start */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */ - "\tBEQ 9f \n" /* goto EXIT */ - - /* TAIL PROCESSING BLOCK 1 */ - - "\tLDR r3, [%[src]], #4 \n" /* r3 = *src, src++ */ - "\tLDR r7, [%[dst]] \n" /* r7 = *dst */ - - "\tLSR r11, r3, #24 \n" /* extracting alpha from source */ - "\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */ - "\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */ - "\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */ - "\tMUL r9, r9, r11 \n" /* br = br * scale */ - "\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */ - "\tMUL r10, r10, r11 \n" /* ag = ag * scale */ - "\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */ - "\tORR r7, r9, r10 \n" /* br | ag */ - "\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */ - - "\tSTR r7, [%[dst]], #4 \n" /* *dst = r7; dst++ */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */ - "\tBEQ 9f \n" /* goto EXIT */ - - /* TAIL PROCESSING BLOCK 2 */ - - "\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */ - "\tLDR r7, [%[dst]] \n" - - "\tLSR r11, r3, #24 \n" - "\tAND r9, r12, r7 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r10, r12, r7, LSR #8 \n" - "\tMUL r9, r9, r11 \n" - "\tAND r9, r12, r9, LSR #8 \n" - "\tMUL r10, r10, r11 \n" - "\tAND r10, r10, r12, LSL #8 \n" - "\tORR r7, r9, r10 \n" - "\tADD r7, r3, r7 \n" - - "\tSTR r7, [%[dst]], #4 \n" - - "\tCMP %[src], r2 \n" - "\tBEQ 9f \n" - - /* TAIL PROCESSING BLOCK 3 */ - - "\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */ - "\tLDR r7, [%[dst]] \n" - - "\tLSR r11, r3, #24 \n" - "\tAND r9, r12, r7 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r10, r12, r7, LSR #8 \n" - "\tMUL r9, r9, r11 \n" - "\tAND r9, r12, r9, LSR #8 \n" - "\tMUL r10, r10, r11 \n" - "\tAND r10, r10, r12, LSL #8 \n" - "\tORR r7, r9, r10 \n" - "\tADD r7, r3, r7 \n" - - "\tSTR r7, [%[dst]], #4 \n" - - /* END OF TAIL BLOCK */ - - "\t9: \n" /* EXIT */ - - "\tLDMIA r13!, {r4-r12, r14} \n" /* restoring r4-r12, lr from stack */ - "\tBX lr \n" /* return */ - - : [dst] "+r" (dst), [src] "+r" (src) - : - : "cc", "r2", "r3", "memory" - - ); - -} #endif // USE_ARM_CODE /* @@ -666,21 +366,7 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm[] = { const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = { NULL, // S32_Opaque, NULL, // S32_Blend, - /* - * We have two choices for S32A_Opaque procs. The one reads the src alpha - * value and attempts to optimize accordingly. The optimization is - * sensitive to the source content and is not a win in all cases. For - * example, if there are a lot of transitions between the alpha states, - * the performance will almost certainly be worse. However, for many - * common cases the performance is equivalent or better than the standard - * case where we do not inspect the src alpha. - */ -#if SK_A32_SHIFT == 24 - // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor - S32A_Opaque_BlitRow32_arm_src_alpha, // S32A_Opaque, -#else S32A_Opaque_BlitRow32_arm, // S32A_Opaque, -#endif S32A_Blend_BlitRow32_arm // S32A_Blend }; #endif diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index 686c8e0476..14d59682e1 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -517,176 +517,6 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, } } -void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - SkASSERT(255 == alpha); - - if (count <= 0) - return; - - /* Use these to check if src is transparent or opaque */ - const unsigned int ALPHA_OPAQ = 0xFF000000; - const unsigned int ALPHA_TRANS = 0x00FFFFFF; - -#define UNROLL 4 - const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); - const SkPMColor* SK_RESTRICT src_temp = src; - - /* set up the NEON variables */ - uint8x8_t alpha_mask; - static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; - alpha_mask = vld1_u8(alpha_mask_setup); - - uint8x8_t src_raw, dst_raw, dst_final; - uint8x8_t src_raw_2, dst_raw_2, dst_final_2; - uint8x8_t dst_cooked; - uint16x8_t dst_wide; - uint8x8_t alpha_narrow; - uint16x8_t alpha_wide; - - /* choose the first processing type */ - if( src >= src_end) - goto TAIL; - if(*src <= ALPHA_TRANS) - goto ALPHA_0; - if(*src >= ALPHA_OPAQ) - goto ALPHA_255; - /* fall-thru */ - -ALPHA_1_TO_254: - do { - - /* get the source */ - src_raw = vreinterpret_u8_u32(vld1_u32(src)); - src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); - - /* get and hold the dst too */ - dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); - dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); - - - /* get the alphas spread out properly */ - alpha_narrow = vtbl1_u8(src_raw, alpha_mask); - /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ - /* we collapsed (255-a)+1 ... */ - alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); - - /* spread the dest */ - dst_wide = vmovl_u8(dst_raw); - - /* alpha mul the dest */ - dst_wide = vmulq_u16 (dst_wide, alpha_wide); - dst_cooked = vshrn_n_u16(dst_wide, 8); - - /* sum -- ignoring any byte lane overflows */ - dst_final = vadd_u8(src_raw, dst_cooked); - - alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); - /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ - /* we collapsed (255-a)+1 ... */ - alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); - - /* spread the dest */ - dst_wide = vmovl_u8(dst_raw_2); - - /* alpha mul the dest */ - dst_wide = vmulq_u16 (dst_wide, alpha_wide); - dst_cooked = vshrn_n_u16(dst_wide, 8); - - /* sum -- ignoring any byte lane overflows */ - dst_final_2 = vadd_u8(src_raw_2, dst_cooked); - - vst1_u32(dst, vreinterpret_u32_u8(dst_final)); - vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); - - src += UNROLL; - dst += UNROLL; - - /* if 2 of the next pixels aren't between 1 and 254 - it might make sense to go to the optimized loops */ - if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) - break; - - } while(src < src_end); - - if (src >= src_end) - goto TAIL; - - if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) - goto ALPHA_255; - - /*fall-thru*/ - -ALPHA_0: - - /*In this state, we know the current alpha is 0 and - we optimize for the next alpha also being zero. */ - src_temp = src; //so we don't have to increment dst every time - do { - if(*(++src) > ALPHA_TRANS) - break; - if(*(++src) > ALPHA_TRANS) - break; - if(*(++src) > ALPHA_TRANS) - break; - if(*(++src) > ALPHA_TRANS) - break; - } while(src < src_end); - - dst += (src - src_temp); - - /* no longer alpha 0, so determine where to go next. */ - if( src >= src_end) - goto TAIL; - if(*src >= ALPHA_OPAQ) - goto ALPHA_255; - else - goto ALPHA_1_TO_254; - -ALPHA_255: - while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { - dst[0]=src[0]; - dst[1]=src[1]; - dst[2]=src[2]; - dst[3]=src[3]; - src+=UNROLL; - dst+=UNROLL; - if(src >= src_end) - goto TAIL; - } - - //Handle remainder. - if(*src >= ALPHA_OPAQ) { *dst++ = *src++; - if(*src >= ALPHA_OPAQ) { *dst++ = *src++; - if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } - } - } - - if( src >= src_end) - goto TAIL; - if(*src <= ALPHA_TRANS) - goto ALPHA_0; - else - goto ALPHA_1_TO_254; - -TAIL: - /* do any residual iterations */ - src_end += UNROLL + 1; //goto the real end - while(src != src_end) { - if( *src != 0 ) { - if( *src >= ALPHA_OPAQ ) { - *dst = *src; - } - else { - *dst = SkPMSrcOver(*src, *dst); - } - } - src++; - dst++; - } - return; -} /* Neon version of S32_Blend_BlitRow32() * portable version is in src/core/SkBlitRow_D32.cpp @@ -1277,20 +1107,6 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm_neon[] = { const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { NULL, // S32_Opaque, S32_Blend_BlitRow32_neon, // S32_Blend, - /* - * We have two choices for S32A_Opaque procs. The one reads the src alpha - * value and attempts to optimize accordingly. The optimization is - * sensitive to the source content and is not a win in all cases. For - * example, if there are a lot of transitions between the alpha states, - * the performance will almost certainly be worse. However, for many - * common cases the performance is equivalent or better than the standard - * case where we do not inspect the src alpha. - */ -#if SK_A32_SHIFT == 24 - // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor - S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, -#else - S32A_Opaque_BlitRow32_neon, // S32A_Opaque, -#endif + S32A_Opaque_BlitRow32_neon, // S32A_Opaque, S32A_Blend_BlitRow32_arm // S32A_Blend }; |