diff options
Diffstat (limited to 'src/opts/SkBlitRow_opts_arm.cpp')
-rw-r--r-- | src/opts/SkBlitRow_opts_arm.cpp | 314 |
1 files changed, 0 insertions, 314 deletions
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index 36bed97ccb..f6e6ba2966 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -185,306 +185,6 @@ static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory" ); } - -static void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_src_alpha - (SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - -/* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */ -/* Predicts that the next pixel will have the same alpha type as the current pixel */ - -asm volatile ( - - "\tSTMDB r13!, {r4-r12, r14} \n" /* saving r4-r12, lr on the stack */ - /* we should not save r0-r3 according to ABI */ - - "\tCMP r2, #0 \n" /* if (count == 0) */ - "\tBEQ 9f \n" /* go to EXIT */ - - "\tMOV r12, #0xff \n" /* load the 0xff mask in r12 */ - "\tORR r12, r12, r12, LSL #16 \n" /* convert it to 0xff00ff in r12 */ - - "\tMOV r14, #255 \n" /* r14 = 255 */ - /* will be used later for left-side comparison */ - - "\tADD r2, %[src], r2, LSL #2 \n" /* r2 points to last array element which can be used */ - "\tSUB r2, r2, #16 \n" /* as a base for 4-way processing algorithm */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer is bigger than */ - "\tBGT 8f \n" /* calculated marker for 4-way -> */ - /* use simple one-by-one processing */ - - /* START OF DISPATCHING BLOCK */ - - "\t0: \n" - - "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ - - "\tLSR r7, r3, #24 \n" /* if not all src alphas of 4-way block are equal -> */ - "\tCMP r7, r4, LSR #24 \n" - "\tCMPEQ r7, r5, LSR #24 \n" - "\tCMPEQ r7, r6, LSR #24 \n" - "\tBNE 1f \n" /* -> go to general 4-way processing routine */ - - "\tCMP r14, r7 \n" /* if all src alphas are equal to 255 */ - "\tBEQ 3f \n" /* go to alpha == 255 optimized routine */ - - "\tCMP r7, #0 \n" /* if all src alphas are equal to 0 */ - "\tBEQ 6f \n" /* go to alpha == 0 optimized routine */ - - /* END OF DISPATCHING BLOCK */ - - /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */ - - "\t1: \n" - /* we do not have enough registers to make */ - /* 4-way [dst] loading -> we are using 2 * 2-way */ - - "\tLDM %[dst], {r7, r8} \n" /* 1st 2-way loading of dst values to r7-r8 */ - - /* PROCESSING BLOCK 1 */ - /* r3 = src, r7 = dst */ - - "\tLSR r11, r3, #24 \n" /* extracting alpha from source and storing to r11 */ - "\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */ - "\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */ - "\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */ - "\tMUL r9, r9, r11 \n" /* br = br * scale */ - "\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */ - "\tMUL r10, r10, r11 \n" /* ag = ag * scale */ - "\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */ - "\tORR r7, r9, r10 \n" /* br | ag */ - "\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */ - - /* PROCESSING BLOCK 2 */ - /* r4 = src, r8 = dst */ - - "\tLSR r11, r4, #24 \n" /* see PROCESSING BLOCK 1 */ - "\tAND r9, r12, r8 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r10, r12, r8, LSR #8 \n" - "\tMUL r9, r9, r11 \n" - "\tAND r9, r12, r9, LSR #8 \n" - "\tMUL r10, r10, r11 \n" - "\tAND r10, r10, r12, LSL #8 \n" - "\tORR r8, r9, r10 \n" - "\tADD r8, r4, r8 \n" - - "\tSTM %[dst]!, {r7, r8} \n" /* 1st 2-way storing of processed dst values */ - - "\tLDM %[dst], {r9, r10} \n" /* 2nd 2-way loading of dst values to r9-r10 */ - - /* PROCESSING BLOCK 3 */ - /* r5 = src, r9 = dst */ - - "\tLSR r11, r5, #24 \n" /* see PROCESSING BLOCK 1 */ - "\tAND r7, r12, r9 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r8, r12, r9, LSR #8 \n" - "\tMUL r7, r7, r11 \n" - "\tAND r7, r12, r7, LSR #8 \n" - "\tMUL r8, r8, r11 \n" - "\tAND r8, r8, r12, LSL #8 \n" - "\tORR r9, r7, r8 \n" - "\tADD r9, r5, r9 \n" - - /* PROCESSING BLOCK 4 */ - /* r6 = src, r10 = dst */ - - "\tLSR r11, r6, #24 \n" /* see PROCESSING BLOCK 1 */ - "\tAND r7, r12, r10 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r8, r12, r10, LSR #8 \n" - "\tMUL r7, r7, r11 \n" - "\tAND r7, r12, r7, LSR #8 \n" - "\tMUL r8, r8, r11 \n" - "\tAND r8, r8, r12, LSL #8 \n" - "\tORR r10, r7, r8 \n" - "\tADD r10, r6, r10 \n" - - "\tSTM %[dst]!, {r9, r10} \n" /* 2nd 2-way storing of processed dst values */ - - "\tCMP %[src], r2 \n" /* if our current [src] pointer <= calculated marker */ - "\tBLE 0b \n" /* we could run 4-way processing -> go to dispatcher */ - "\tBGT 8f \n" /* else -> use simple one-by-one processing */ - - /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */ - - /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */ - - "\t2: \n" /* ENTRY 1: LOADING [src] to registers */ - - "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ - - "\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */ - "\tAND r8, r5, r6 \n" - "\tAND r9, r7, r8 \n" - "\tCMP r14, r9, LSR #24 \n" - "\tBNE 4f \n" /* -> go to alpha == 0 check */ - - "\t3: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */ - - "\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ - "\tBLE 2b \n" /* we could run 4-way processing */ - /* because now we're in ALPHA == 255 state */ - /* run next cycle with priority alpha == 255 checks */ - - "\tBGT 8f \n" /* if our current [src] array pointer > marker */ - /* use simple one-by-one processing */ - - "\t4: \n" - - "\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */ - "\tORR r8, r5, r6 \n" - "\tORR r9, r7, r8 \n" - "\tLSRS r9, #24 \n" - "\tBNE 1b \n" /* -> go to general processing mode */ - /* (we already checked for alpha == 255) */ - - "\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ - "\tBLE 5f \n" /* we could run 4-way processing one more time */ - /* because now we're in ALPHA == 0 state */ - /* run next cycle with priority alpha == 0 checks */ - - "\tBGT 8f \n" /* if our current [src] array pointer > marker */ - /* use simple one-by-one processing */ - - /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */ - - /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */ - - "\t5: \n" /* ENTRY 1: LOADING [src] to registers */ - - "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ - - "\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */ - "\tORR r8, r5, r6 \n" - "\tORR r9, r7, r8 \n" - "\tLSRS r9, #24 \n" - "\tBNE 7f \n" /* -> go to alpha == 255 check */ - - "\t6: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */ - - "\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ - "\tBLE 5b \n" /* we could run 4-way processing one more time */ - /* because now we're in ALPHA == 0 state */ - /* run next cycle with priority alpha == 0 checks */ - - "\tBGT 8f \n" /* if our current [src] array pointer > marker */ - /* use simple one-by-one processing */ - "\t7: \n" - - "\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */ - "\tAND r8, r5, r6 \n" - "\tAND r9, r7, r8 \n" - "\tCMP r14, r9, LSR #24 \n" - "\tBNE 1b \n" /* -> go to general processing mode */ - /* (we already checked for alpha == 0) */ - - "\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ - "\tBLE 2b \n" /* we could run 4-way processing one more time */ - /* because now we're in ALPHA == 255 state */ - /* run next cycle with priority alpha == 255 checks */ - - "\tBGT 8f \n" /* if our current [src] array pointer > marker */ - /* use simple one-by-one processing */ - - /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */ - - /* START OF TAIL BLOCK */ - /* (used when array is too small to be processed with 4-way algorithm)*/ - - "\t8: \n" - - "\tADD r2, r2, #16 \n" /* now r2 points to the element just after array */ - /* we've done r2 = r2 - 16 at procedure start */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */ - "\tBEQ 9f \n" /* goto EXIT */ - - /* TAIL PROCESSING BLOCK 1 */ - - "\tLDR r3, [%[src]], #4 \n" /* r3 = *src, src++ */ - "\tLDR r7, [%[dst]] \n" /* r7 = *dst */ - - "\tLSR r11, r3, #24 \n" /* extracting alpha from source */ - "\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */ - "\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */ - "\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */ - "\tMUL r9, r9, r11 \n" /* br = br * scale */ - "\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */ - "\tMUL r10, r10, r11 \n" /* ag = ag * scale */ - "\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */ - "\tORR r7, r9, r10 \n" /* br | ag */ - "\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */ - - "\tSTR r7, [%[dst]], #4 \n" /* *dst = r7; dst++ */ - - "\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */ - "\tBEQ 9f \n" /* goto EXIT */ - - /* TAIL PROCESSING BLOCK 2 */ - - "\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */ - "\tLDR r7, [%[dst]] \n" - - "\tLSR r11, r3, #24 \n" - "\tAND r9, r12, r7 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r10, r12, r7, LSR #8 \n" - "\tMUL r9, r9, r11 \n" - "\tAND r9, r12, r9, LSR #8 \n" - "\tMUL r10, r10, r11 \n" - "\tAND r10, r10, r12, LSL #8 \n" - "\tORR r7, r9, r10 \n" - "\tADD r7, r3, r7 \n" - - "\tSTR r7, [%[dst]], #4 \n" - - "\tCMP %[src], r2 \n" - "\tBEQ 9f \n" - - /* TAIL PROCESSING BLOCK 3 */ - - "\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */ - "\tLDR r7, [%[dst]] \n" - - "\tLSR r11, r3, #24 \n" - "\tAND r9, r12, r7 \n" - "\tRSB r11, r11, #256 \n" - "\tAND r10, r12, r7, LSR #8 \n" - "\tMUL r9, r9, r11 \n" - "\tAND r9, r12, r9, LSR #8 \n" - "\tMUL r10, r10, r11 \n" - "\tAND r10, r10, r12, LSL #8 \n" - "\tORR r7, r9, r10 \n" - "\tADD r7, r3, r7 \n" - - "\tSTR r7, [%[dst]], #4 \n" - - /* END OF TAIL BLOCK */ - - "\t9: \n" /* EXIT */ - - "\tLDMIA r13!, {r4-r12, r14} \n" /* restoring r4-r12, lr from stack */ - "\tBX lr \n" /* return */ - - : [dst] "+r" (dst), [src] "+r" (src) - : - : "cc", "r2", "r3", "memory" - - ); - -} #endif // USE_ARM_CODE /* @@ -666,21 +366,7 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm[] = { const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = { NULL, // S32_Opaque, NULL, // S32_Blend, - /* - * We have two choices for S32A_Opaque procs. The one reads the src alpha - * value and attempts to optimize accordingly. The optimization is - * sensitive to the source content and is not a win in all cases. For - * example, if there are a lot of transitions between the alpha states, - * the performance will almost certainly be worse. However, for many - * common cases the performance is equivalent or better than the standard - * case where we do not inspect the src alpha. - */ -#if SK_A32_SHIFT == 24 - // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor - S32A_Opaque_BlitRow32_arm_src_alpha, // S32A_Opaque, -#else S32A_Opaque_BlitRow32_arm, // S32A_Opaque, -#endif S32A_Blend_BlitRow32_arm // S32A_Blend }; #endif |