arm/neon optimizations for bitmap shader

original version by ARM LIMITED 2009 git-svn-id: http://skia.googlecode.com/svn/trunk@303 2bbb7eff-a529-9590-31e7-b0007b416f81
author: reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81> 2009-08-03 17:22:46 +0000
committer: reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81> 2009-08-03 17:22:46 +0000
commit: a0bd7f48aaaf2c86ba5b5de068e9d1f56e721a3c (patch)
tree: 5d8b782c97909e391ed90689a554898d4582ca48
parent: afb055ad792a1eb859fea7baf44a1bcfc4a0a11d (diff)
1 files changed, 717 insertions, 0 deletions
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
new file mode 100644
index 0000000000..5c60731634
--- /dev/null
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -0,0 +1,717 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SkBitmapProcState.h"
+
+#if __ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
+void S16_D16_nofilter_DX_arm(const SkBitmapProcState& s,               // unfiltered 16-bit -> 16-bit sampler for one source row: colors[i] = row[x index i]
+                             const uint32_t* SK_RESTRICT xy,           // xy[0] is the row index; the 16-bit x indices are packed after it
+                             int count, uint16_t* SK_RESTRICT colors) { // count = number of pixels to write into colors
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
+    SkASSERT(s.fDoFilter == false);
+    // start of the source pixels; moved to the selected row below
+    const uint16_t* SK_RESTRICT srcAddr = (const uint16_t*)s.fBitmap->getPixels();
+    
+    // buffer is y32, x16, x16, x16, x16, x16
+    // bump srcAddr to the proper row, since we're told Y never changes
+    SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height());
+    srcAddr = (const uint16_t*)((const char*)srcAddr +
+                                xy[0] * s.fBitmap->rowBytes());
+    
+    uint16_t src;
+    
+    if (1 == s.fBitmap->width()) {   // one-pixel-wide bitmap: every output is srcAddr[0], the x indices are not read
+        src = srcAddr[0];
+        uint16_t dstValue = src;
+        sk_memset16(colors, dstValue, count);
+    } else {
+        int i;
+        const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1);   // x indices begin right after the 32-bit y word
+        // main loop in assembly, four pixels per iteration; skipped entirely when count < 4
+        if((count >> 2) > 0) {
+            asm volatile (
+                          "mov        r8, %[count], lsr #2    \n\t"   // shift down count so we iterate in fours
+                          "1:                                     \n\t"
+                          "subs       r8, r8, #1              \n\t"   // decrement loop counter (flags are consumed by the bgt at the bottom)
+                          "ldrh       r4, [%[xx]], #2         \n\t"   // load xx value, update ptr
+                          "ldrh       r5, [%[xx]], #2         \n\t"   // load xx value, update ptr
+                          "ldrh       r6, [%[xx]], #2         \n\t"   // load xx value, update ptr
+                          "add        r4, r4, r4              \n\t"   // double offset for half word addressing
+                          "ldrh       r7, [%[xx]], #2         \n\t"   // load xx value, update ptr
+                          "add        r5, r5, r5              \n\t"   // double offset for half word addressing
+                          "ldrh       r4, [%[srcAddr], r4]    \n\t"   // load value from srcAddr[*xx]
+                          "add        r6, r6, r6              \n\t"   // double offset for half word addressing
+                          "ldrh       r5, [%[srcAddr], r5]    \n\t"   // load value from srcAddr[*xx]
+                          "add        r7, r7, r7              \n\t"   // double offset for half word addressing
+                          "ldrh       r6, [%[srcAddr], r6]    \n\t"   // load value from srcAddr[*xx]
+                          "ldrh       r7, [%[srcAddr], r7]    \n\t"   // load value from srcAddr[*xx]
+                          "strh       r4, [%[colors]], #2     \n\t"   // store value to colors, update ptr
+                          "strh       r5, [%[colors]], #2     \n\t"   // store value to colors, update ptr
+                          "strh       r6, [%[colors]], #2     \n\t"   // store value to colors, update ptr
+                          "strh       r7, [%[colors]], #2     \n\t"   // store value to colors, update ptr
+                          "bgt        1b                      \n\t"   // branch if loop counter > 0
+                          : [count] "+r" (count), [xx] "+r" (xx), [srcAddr] "+r" (srcAddr), [colors] "+r" (colors)   // xx and colors come back advanced; count and srcAddr are only read by the asm
+                          :
+                          : "cc", "memory", "r4", "r5", "r6", "r7", "r8"   // r4-r7 hold the four indices/pixels, r8 is the loop counter
+                          );
+        }
+        for (i = (count & 3); i > 0; --i) {   // remaining 0-3 pixels, continuing from where the asm left xx and colors
+            SkASSERT(*xx < (unsigned)s.fBitmap->width());
+            src = srcAddr[*xx++]; *colors++ = src;
+        }
+    }
+}
+#endif //__ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
+
+#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
+void S16_D16_filter_DX_arm(const SkBitmapProcState& s,   // bilinear-filtered 16-bit (565) -> 16-bit sampler: each output blends a 2x2 block taken from two source rows
+                           const uint32_t* SK_RESTRICT xy,   // first word packs y0/subY/y1; each following word packs x0/subX/x1 for one output pixel
+                           int count, uint16_t* SK_RESTRICT colors)   // count = number of pixels to write into colors
+{
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fDoFilter);
+    // base address and row stride (in bytes) of the source bitmap
+    const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
+    unsigned rb = s.fBitmap->rowBytes();
+    unsigned subY;   // 4-bit vertical weight taken from the first xy word
+    const uint16_t* SK_RESTRICT row0;
+    const uint16_t* SK_RESTRICT row1;
+    unsigned int rowgap;   // byte distance from row0 to row1, used by the asm as a post-increment
+    const uint32_t c7ffe = 0x7ffe;   // mask that keeps the 14-bit x1 field once it has been shifted left by 1 (giving a byte offset)
+    
+    // set up the row pointers, rowgap and subY from the first word: [ y0:14 | subY:4 | y1:14 ]
+    {
+        uint32_t XY = *xy++;
+        unsigned y0 = XY >> 14;
+        row0 = (const uint16_t*)(srcAddr + (y0 >> 4) * rb);
+        row1 = (const uint16_t*)(srcAddr + (XY & 0x3FFF) * rb);
+        rowgap = (unsigned int)row1 - (unsigned int)row0;   // NOTE(review): the casts assume pointers fit in unsigned int (32-bit target) - confirm
+        subY = y0 & 0xF;
+    }
+    
+    unsigned int count4 = ((count >> 2) << 4) | subY;   // number of 4-pixel groups in the upper bits, subY in the low 4 bits, so one asm operand carries both
+    count &= 3;   // leftover pixels, handled by the C loop after the asm
+    
+    asm volatile (
+                  "and            r4, %[count4], #0xF             \n\t"   // mask off subY
+                  "vmov.u16       d2[0], r4                       \n\t"   // move subY to Neon
+                  "rsb            r4, r4, #16                     \n\t"   // r4 = 16-subY
+                  "vmov.u16       d2[1], r4                       \n\t"   // move 16-subY to Neon
+                  "movs           %[count4], %[count4], lsr #4    \n\t"   // shift count down, lose subY; sets the flags tested by the beq below
+                  "vmov.u16       d3, #16                         \n\t"   // create constant
+                  "vmov.u16       q2, #31                         \n\t"   // set up blue mask
+                  "beq            2f                              \n\t"   // if count4 == 0, exit
+                  
+                  "1:                                                 \n\t"
+                  "ldmia          %[xy]!, {r4, r5, r6, r7}        \n\t"   // load four xy values
+                  // xy = [ x0:14 | subX:4 | x1:14 ]
+                  // extract subX for iter 0-3
+                  "vmov           d0, r4, r5                      \n\t"   // move xy to Neon, iter 0-1
+                  "vmov           d1, r6, r7                      \n\t"   // move xy to Neon, iter 2-3
+                  
+                  // Load 16 pixels for four filter iterations from memory.
+                  // Because the source pixels are potentially scattered, each lane
+                  // of each vector is loaded separately. Also, the X sub pixel
+                  // offset is extracted.
+                  
+                  // iter 0
+                  "mov            r8, r4, lsr #18                 \n\t"   // extract x0
+                  "and            r4, %[c7ffe], r4, lsl #1        \n\t"   // extract x1 and make byte offset
+                  "add            r8, %[row0], r8, lsl #1         \n\t"   // calculate address of row0[x0]
+                  "add            r4, %[row0], r4                 \n\t"   // calculate address of row0[x1]
+                  "vld1.u16       {d16[0]}, [r8], %[rowgap]       \n\t"   // load row0[x0] and move ptr to row1
+                  "vld1.u16       {d17[0]}, [r4], %[rowgap]       \n\t"   // load row0[x1] and move ptr to row1
+                  "vld1.u16       {d18[0]}, [r8]                  \n\t"   // load row1[x0]
+                  "vld1.u16       {d19[0]}, [r4]                  \n\t"   // load row1[x1]
+                  
+                  // iter 1
+                  "mov            r8, r5, lsr #18                 \n\t"   // extract x0
+                  "and            r5, %[c7ffe], r5, lsl #1        \n\t"   // extract x1 and make byte offset
+                  "add            r8, %[row0], r8, lsl #1         \n\t"   // calculate address of row0[x0]
+                  "add            r5, %[row0], r5                 \n\t"   // calculate address of row0[x1]
+                  "vld1.u16       {d16[1]}, [r8], %[rowgap]       \n\t"   // load row0[x0] and move ptr to row1
+                  "vld1.u16       {d17[1]}, [r5], %[rowgap]       \n\t"   // load row0[x1] and move ptr to row1
+                  "vld1.u16       {d18[1]}, [r8]                  \n\t"   // load row1[x0]
+                  "vld1.u16       {d19[1]}, [r5]                  \n\t"   // load row1[x1]
+                  
+                  "vshrn.u32      d0, q0, #2                      \n\t"   // shift right by 2 and narrow: subX lands in the top 4 bits of each 16-bit lane
+                  // iter 2
+                  "mov            r8, r6, lsr #18                 \n\t"   // extract x0
+                  "and            r6, %[c7ffe], r6, lsl #1        \n\t"   // extract x1 and make byte offset
+                  "add            r8, %[row0], r8, lsl #1         \n\t"   // calculate address of row0[x0]
+                  "add            r6, %[row0], r6                 \n\t"   // calculate address of row0[x1]
+                  "vld1.u16       {d16[2]}, [r8], %[rowgap]       \n\t"   // load row0[x0] and move ptr to row1
+                  "vld1.u16       {d17[2]}, [r6], %[rowgap]       \n\t"   // load row0[x1] and move ptr to row1
+                  "vld1.u16       {d18[2]}, [r8]                  \n\t"   // load row1[x0]
+                  "vld1.u16       {d19[2]}, [r6]                  \n\t"   // load row1[x1]
+                  
+                  "vshr.u16       d0, d0, #12                     \n\t"   // shift right subX to bottom 4 bits
+                  // iter 3
+                  "mov            r8, r7, lsr #18                 \n\t"   // extract x0
+                  "and            r7, %[c7ffe], r7, lsl #1        \n\t"   // extract x1 and make byte offset
+                  "add            r8, %[row0], r8, lsl #1         \n\t"   // calculate address of row0[x0]
+                  "add            r7, %[row0], r7                 \n\t"   // calculate address of row0[x1]
+                  "vld1.u16       {d16[3]}, [r8], %[rowgap]       \n\t"   // load row0[x0] and move ptr to row1
+                  "vld1.u16       {d17[3]}, [r7], %[rowgap]       \n\t"   // load row0[x1] and move ptr to row1
+                  "vld1.u16       {d18[3]}, [r8]                  \n\t"   // load row1[x0]
+                  "vld1.u16       {d19[3]}, [r7]                  \n\t"   // load row1[x1]
+                  
+                  // Registers d16-d19 now contain pixels a00-a11 for 4 iterations:
+                  //   d16 = [ a00_3 | a00_2 | a00_1 | a00_0 ]
+                  //   d17 = [ a01_3 | a01_2 | a01_1 | a01_0 ]
+                  //   d18 = [ a10_3 | a10_2 | a10_1 | a10_0 ]
+                  //   d19 = [ a11_3 | a11_2 | a11_1 | a11_0 ]
+                  //
+                  // Extract RGB channels from each 565 pixel.
+                  
+                  "vshl.i16       q11, q8, #5                     \n\t"   // shift greens to top of each lane
+                  "vand           q12, q8, q2                     \n\t"   // mask blues
+                  "vshr.u16       q10, q8, #11                    \n\t"   // shift reds to bottom of each lane
+                  "vshr.u16       q11, q11, #10                   \n\t"   // shift greens to bottom of each lane
+                  "vshl.i16       q14, q9, #5                     \n\t"   // shift greens to top of each lane
+                  "vand           q15, q9, q2                     \n\t"   // mask blues
+                  "vshr.u16       q13, q9, #11                    \n\t"   // shift reds to bottom of each lane
+                  "vshr.u16       q14, q14, #10                   \n\t"   // shift greens to bottom of each lane
+                  
+                  // There are now six Q regs, containing
+                  //   q10 = [ a01r3 | a01r2 | a01r1 | a01r0 | a00r3 | a00r2 | a00r1 | a00r0 ]
+                  //   q11 = [ a01g3 | a01g2 | a01g1 | a01g0 | a00g3 | a00g2 | a00g1 | a00g0 ]
+                  //   q12 = [ a01b3 | a01b2 | a01b1 | a01b0 | a00b3 | a00b2 | a00b1 | a00b0 ]
+                  //   q13 = [ a11r3 | a11r2 | a11r1 | a11r0 | a10r3 | a10r2 | a10r1 | a10r0 ]
+                  //   q14 = [ a11g3 | a11g2 | a11g1 | a11g0 | a10g3 | a10g2 | a10g1 | a10g0 ]
+                  //   q15 = [ a11b3 | a11b2 | a11b1 | a11b0 | a10b3 | a10b2 | a10b1 | a10b0 ]
+                  // where aXXyZ: XX = pixel position, y = colour channel, Z = iteration
+                  //   d0 = subX, d1 = 16-subX (computed by the vsub just below)
+                  //   d2[0] = subY, d2[1] = 16-subY
+                  //   d3 = 16, q2(d4d5) = 31
+                  
+                  // The filter:
+                  //
+                  //           |        |
+                  //     ---- a00 ---- a01 ----> * (16-y)
+                  //           |        |
+                  //     -----a10 ---- a11 ----> * y
+                  //           |        |
+                  //           V        V
+                  //        * (16-x)   * x
+                  //
+                  // result = (a00.(16-y).(16-x) + a01.(16-y).x + a10.(16-x).y + a11.x.y) >> 8
+                  //
+                  
+                  "vsub.u16       d1, d3, d0                      \n\t"   // calculate 16-subX
+                  // multiply top pixel pair by (16-y)
+                  "vmul.i16       q10, q10, d2[1]                 \n\t"   // top reds multiplied by (16-y)
+                  "vmul.i16       q11, q11, d2[1]                 \n\t"   // top greens multiplied by (16-y)
+                  "vmul.i16       q12, q12, d2[1]                 \n\t"   // top blues multiplied by (16-y)
+                  // multiply bottom pixel pair by y
+                  "vmul.i16       q13, q13, d2[0]                 \n\t"   // bottom reds multiplied by y
+                  "vmul.i16       q14, q14, d2[0]                 \n\t"   // bottom greens multiplied by y
+                  "vmul.i16       q15, q15, d2[0]                 \n\t"   // bottom blues multiplied by y
+                  // mul/acc left pixels by (16-x); the operands already carry their y weight from the six multiplies above
+                  "vmul.i16       d16, d20, d1                    \n\t"   // resultr  = a00r * (16-x)
+                  "vmul.i16       d17, d22, d1                    \n\t"   // resultg  = a00g * (16-x)
+                  "vmul.i16       d18, d24, d1                    \n\t"   // resultb  = a00b * (16-x)
+                  "vmla.i16       d16, d26, d1                    \n\t"   // resultr += a10r * (16-x)
+                  "vmla.i16       d17, d28, d1                    \n\t"   // resultg += a10g * (16-x)
+                  "vmla.i16       d18, d30, d1                    \n\t"   // resultb += a10b * (16-x)
+                  // mul/acc right pixels by x
+                  "vmla.i16       d16, d21, d0                    \n\t"   // resultr += a01r * x
+                  "vmla.i16       d17, d23, d0                    \n\t"   // resultg += a01g * x
+                  "vmla.i16       d18, d25, d0                    \n\t"   // resultb += a01b * x
+                  "vmla.i16       d16, d27, d0                    \n\t"   // resultr += a11r * x
+                  "vmla.i16       d17, d29, d0                    \n\t"   // resultg += a11g * x
+                  "vmla.i16       d18, d31, d0                    \n\t"   // resultb += a11b * x
+                  "subs           %[count4], %[count4], #1        \n\t"   // decrement counter (flags are consumed by the bgt below)
+                  // shift results down 8 bits
+                  "vshr.u16       q8, q8, #8                      \n\t"   // resultr >>= 8, resultg >>=8
+                  "vshr.u16       d18, d18, #8                    \n\t"   // resultb >>= 8
+                  // put rgb into 565
+                  "vsli.i16       d18, d17, #5                    \n\t"   // shift greens into blues
+                  "vsli.i16       d18, d16, #11                   \n\t"   // shift reds into greens and blues
+                  "vst1.i16       {d18}, [%[colors]]!             \n\t"   // store result (four 565 pixels), update ptr
+                  "bgt            1b                              \n\t"   // if counter > 0, loop
+                  "2:                                                 \n\t"   // exit
+                  : [xy] "+r" (xy), [count4] "+r" (count4), [colors] "+r" (colors)   // xy and colors come back advanced past the pixels handled here
+                  : [row0] "r" (row0), [rowgap] "r" (rowgap), [c7ffe] "r" (c7ffe)   // read-only inputs
+                  : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+                  );
+    // remaining 0-3 pixels in C; this path uses combined weights that sum to 32 (>>5) on expanded pixels, not the 16x16 (>>8) per-channel weights of the asm, so rounding may differ
+    while(count != 0)
+    {
+        uint32_t XX = *xy++;    // x0:14 | subX:4 | x1:14
+        unsigned x0 = XX >> 14;
+        unsigned x1 = XX & 0x3FFF;
+        unsigned subX = x0 & 0xF;
+        x0 >>= 4;
+        
+        uint32_t a00 = SkExpand_rgb_16(row0[x0]);
+        uint32_t a01 = SkExpand_rgb_16(row0[x1]);
+        uint32_t a10 = SkExpand_rgb_16(row1[x0]);
+        uint32_t a11 = SkExpand_rgb_16(row1[x1]);
+        
+        int xy = subX * subY >> 3;   // NOTE: this local shadows the xy parameter from here to the end of the loop body; the *xy++ at the top of the body still reads the parameter
+        uint32_t c = a00 * (32 - 2*subY - 2*subX + xy) +
+        a01 * (2*subX - xy) +
+        a10 * (2*subY - xy) +
+        a11 * xy;
+        
+        *colors++ = SkCompact_rgb_16(c>>5);
+        count--;
+    }
+}
+#endif //defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
+
+#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
+void SI8_D16_nofilter_DX_arm(const SkBitmapProcState& s,               // unfiltered 8-bit indexed -> 16-bit sampler for one source row: colors[i] = table[row[x index i]]
+                             const uint32_t* SK_RESTRICT xy,           // xy[0] is the row index; the 16-bit x indices are packed after it
+                             int count, uint16_t* SK_RESTRICT colors) { // count = number of pixels to write into colors
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
+    SkASSERT(s.fDoFilter == false);
+    // 16-bit lookup table from the bitmap's colour table; handed back via unlock16BitCache() at the end of the function
+    const uint16_t* SK_RESTRICT table = s.fBitmap->getColorTable()->lock16BitCache();
+    const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fBitmap->getPixels();
+    
+    // buffer is y32, x16, x16, x16, x16, x16
+    // bump srcAddr to the proper row, since we're told Y never changes
+    SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height());
+    srcAddr = (const uint8_t*)((const char*)srcAddr +
+                               xy[0] * s.fBitmap->rowBytes());
+    
+    uint8_t src;
+    
+    if (1 == s.fBitmap->width()) {   // one-pixel-wide bitmap: every output is table[srcAddr[0]], the x indices are not read
+        src = srcAddr[0];
+        uint16_t dstValue = table[src];
+        sk_memset16(colors, dstValue, count);
+    } else {
+        int i;
+        int count8 = count >> 3;   // number of 8-pixel groups handled by the asm loop
+        const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1);   // x indices begin right after the 32-bit y word
+        // eight pixels per iteration: four word loads fetch eight 16-bit indices, four word stores write eight 16-bit results
+        asm volatile (
+                      "cmp        %[count8], #0                   \n\t"   // compare loop counter with 0
+                      "beq        2f                              \n\t"   // if loop counter == 0, exit
+                      "1:                                             \n\t"
+                      "ldmia      %[xx]!, {r5, r7, r9, r11}       \n\t"   // load ptrs to pixels 0-7
+                      "subs       %[count8], %[count8], #1        \n\t"   // decrement loop counter (flags are consumed by the bgt at the bottom)
+                      "uxth       r4, r5                          \n\t"   // extract ptr 0
+                      "mov        r5, r5, lsr #16                 \n\t"   // extract ptr 1
+                      "uxth       r6, r7                          \n\t"   // extract ptr 2
+                      "mov        r7, r7, lsr #16                 \n\t"   // extract ptr 3
+                      "ldrb       r4, [%[srcAddr], r4]            \n\t"   // load pixel 0 from image
+                      "uxth       r8, r9                          \n\t"   // extract ptr 4
+                      "ldrb       r5, [%[srcAddr], r5]            \n\t"   // load pixel 1 from image
+                      "mov        r9, r9, lsr #16                 \n\t"   // extract ptr 5
+                      "ldrb       r6, [%[srcAddr], r6]            \n\t"   // load pixel 2 from image
+                      "uxth       r10, r11                        \n\t"   // extract ptr 6
+                      "ldrb       r7, [%[srcAddr], r7]            \n\t"   // load pixel 3 from image
+                      "mov        r11, r11, lsr #16               \n\t"   // extract ptr 7
+                      "ldrb       r8, [%[srcAddr], r8]            \n\t"   // load pixel 4 from image
+                      "add        r4, r4, r4                      \n\t"   // double pixel 0 for RGB565 lookup
+                      "ldrb       r9, [%[srcAddr], r9]            \n\t"   // load pixel 5 from image
+                      "add        r5, r5, r5                      \n\t"   // double pixel 1 for RGB565 lookup
+                      "ldrb       r10, [%[srcAddr], r10]          \n\t"   // load pixel 6 from image
+                      "add        r6, r6, r6                      \n\t"   // double pixel 2 for RGB565 lookup
+                      "ldrb       r11, [%[srcAddr], r11]          \n\t"   // load pixel 7 from image
+                      "add        r7, r7, r7                      \n\t"   // double pixel 3 for RGB565 lookup
+                      "ldrh       r4, [%[table], r4]              \n\t"   // load pixel 0 RGB565 from colmap
+                      "add        r8, r8, r8                      \n\t"   // double pixel 4 for RGB565 lookup
+                      "ldrh       r5, [%[table], r5]              \n\t"   // load pixel 1 RGB565 from colmap
+                      "add        r9, r9, r9                      \n\t"   // double pixel 5 for RGB565 lookup
+                      "ldrh       r6, [%[table], r6]              \n\t"   // load pixel 2 RGB565 from colmap
+                      "add        r10, r10, r10                   \n\t"   // double pixel 6 for RGB565 lookup
+                      "ldrh       r7, [%[table], r7]              \n\t"   // load pixel 3 RGB565 from colmap
+                      "add        r11, r11, r11                   \n\t"   // double pixel 7 for RGB565 lookup
+                      "ldrh       r8, [%[table], r8]              \n\t"   // load pixel 4 RGB565 from colmap
+                      "ldrh       r9, [%[table], r9]              \n\t"   // load pixel 5 RGB565 from colmap
+                      "ldrh       r10, [%[table], r10]            \n\t"   // load pixel 6 RGB565 from colmap
+                      "ldrh       r11, [%[table], r11]            \n\t"   // load pixel 7 RGB565 from colmap
+                      "pkhbt      r5, r4, r5, lsl #16             \n\t"   // pack pixels 0 and 1
+                      "pkhbt      r6, r6, r7, lsl #16             \n\t"   // pack pixels 2 and 3
+                      "pkhbt      r8, r8, r9, lsl #16             \n\t"   // pack pixels 4 and 5
+                      "pkhbt      r10, r10, r11, lsl #16          \n\t"   // pack pixels 6 and 7
+                      "stmia      %[colors]!, {r5, r6, r8, r10}   \n\t"   // store 8 pixels as four words; NOTE(review): word stores through a uint16_t* - presumably colors must be 4-byte aligned, confirm against callers
+                      "bgt        1b                              \n\t"   // loop if counter > 0
+                      "2:                                             \n\t"
+                      : [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors)   // xx and colors come back advanced past the pixels handled here
+                      : [table] "r" (table), [srcAddr] "r" (srcAddr)   // read-only inputs
+                      : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"   // r4-r11 hold the eight indices/pixels
+                      );
+        
+        for (i = (count & 7); i > 0; --i) {   // remaining 0-7 pixels, continuing from where the asm left xx and colors
+            src = srcAddr[*xx++]; *colors++ = table[src];
+        }
+    }
+    
+    s.fBitmap->getColorTable()->unlock16BitCache();   // pairs with lock16BitCache() at the top
+}
+
+void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,                // unfiltered 8-bit indexed -> 32-bit sampler for one source row: colors[i] = table[row[x index i]]
+                                    const uint32_t* SK_RESTRICT xy,            // xy[0] is the row index; the 16-bit x indices are packed after it
+                                    int count, SkPMColor* SK_RESTRICT colors) { // count = number of pixels to write into colors
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
+    SkASSERT(s.fDoFilter == false);
+    // SkPMColor lookup table from the bitmap's colour table; handed back via unlockColors() at the end of the function
+    const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
+    const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fBitmap->getPixels();
+    
+    // buffer is y32, x16, x16, x16, x16, x16
+    // bump srcAddr to the proper row, since we're told Y never changes
+    SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height());
+    srcAddr = (const uint8_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes());
+    
+    if (1 == s.fBitmap->width()) {   // one-pixel-wide bitmap: every output is table[srcAddr[0]], the x indices are not read
+        uint8_t src = srcAddr[0];
+        SkPMColor dstValue = table[src];
+        sk_memset32(colors, dstValue, count);
+    } else {
+        const uint16_t* xx = (const uint16_t*)(xy + 1);   // x indices begin right after the 32-bit y word
+        // the asm handles everything: an eight-pixel loop, then a one-pixel loop for the remainder; count is kept biased by -8 while in the first loop
+        asm volatile (
+                      "subs       %[count], %[count], #8          \n\t"   // decrement count by 8, set flags
+                      "blt        2f                              \n\t"   // if count < 0, branch to singles
+                      "1:                                             \n\t"   // eights loop
+                      "ldmia      %[xx]!, {r5, r7, r9, r11}       \n\t"   // load ptrs to pixels 0-7
+                      "uxth       r4, r5                          \n\t"   // extract ptr 0
+                      "mov        r5, r5, lsr #16                 \n\t"   // extract ptr 1
+                      "uxth       r6, r7                          \n\t"   // extract ptr 2
+                      "mov        r7, r7, lsr #16                 \n\t"   // extract ptr 3
+                      "ldrb       r4, [%[srcAddr], r4]            \n\t"   // load pixel 0 from image
+                      "uxth       r8, r9                          \n\t"   // extract ptr 4
+                      "ldrb       r5, [%[srcAddr], r5]            \n\t"   // load pixel 1 from image
+                      "mov        r9, r9, lsr #16                 \n\t"   // extract ptr 5
+                      "ldrb       r6, [%[srcAddr], r6]            \n\t"   // load pixel 2 from image
+                      "uxth       r10, r11                        \n\t"   // extract ptr 6
+                      "ldrb       r7, [%[srcAddr], r7]            \n\t"   // load pixel 3 from image
+                      "mov        r11, r11, lsr #16               \n\t"   // extract ptr 7
+                      "ldrb       r8, [%[srcAddr], r8]            \n\t"   // load pixel 4 from image
+                      "ldrb       r9, [%[srcAddr], r9]            \n\t"   // load pixel 5 from image
+                      "ldrb       r10, [%[srcAddr], r10]          \n\t"   // load pixel 6 from image
+                      "ldrb       r11, [%[srcAddr], r11]          \n\t"   // load pixel 7 from image
+                      "ldr        r4, [%[table], r4, lsl #2]      \n\t"   // load pixel 0 SkPMColor from colmap
+                      "ldr        r5, [%[table], r5, lsl #2]      \n\t"   // load pixel 1 SkPMColor from colmap
+                      "ldr        r6, [%[table], r6, lsl #2]      \n\t"   // load pixel 2 SkPMColor from colmap
+                      "ldr        r7, [%[table], r7, lsl #2]      \n\t"   // load pixel 3 SkPMColor from colmap
+                      "ldr        r8, [%[table], r8, lsl #2]      \n\t"   // load pixel 4 SkPMColor from colmap
+                      "ldr        r9, [%[table], r9, lsl #2]      \n\t"   // load pixel 5 SkPMColor from colmap
+                      "ldr        r10, [%[table], r10, lsl #2]    \n\t"   // load pixel 6 SkPMColor from colmap
+                      "ldr        r11, [%[table], r11, lsl #2]    \n\t"   // load pixel 7 SkPMColor from colmap
+                      "subs       %[count], %[count], #8          \n\t"   // decrement loop counter (flags are consumed by the bge below)
+                      "stmia      %[colors]!, {r4-r11}            \n\t"   // store 8 pixels
+                      "bge        1b                              \n\t"   // loop if counter >= 0
+                      "2:                                             \n\t"
+                      "adds       %[count], %[count], #8          \n\t"   // fix up counter (undo the -8 bias), set flags
+                      "beq        4f                              \n\t"   // if count == 0, branch to exit
+                      "3:                                             \n\t"   // singles loop
+                      "ldrh       r4, [%[xx]], #2                 \n\t"   // load pixel ptr
+                      "subs       %[count], %[count], #1          \n\t"   // decrement loop counter
+                      "ldrb       r5, [%[srcAddr], r4]            \n\t"   // load pixel from image
+                      "ldr        r6, [%[table], r5, lsl #2]      \n\t"   // load SkPMColor from colmap
+                      "str        r6, [%[colors]], #4             \n\t"   // store pixel, update ptr
+                      "bne        3b                              \n\t"   // loop if counter != 0
+                      "4:                                             \n\t"   // exit
+                      : [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors)   // all three are modified: the pointers advance, count reaches 0
+                      : [table] "r" (table), [srcAddr] "r" (srcAddr)   // read-only inputs
+                      : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"   // r4-r11 hold the eight indices/pixels
+                      );
+    }
+    
+    s.fBitmap->getColorTable()->unlockColors(false);   // pairs with lockColors() at the top
+}
+#endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
+
+#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
+static inline void Filter_32_direct(unsigned x, unsigned y,          // bilinear blend of four 32-bit pixels, done per 8-bit channel with NEON; x and y are the weights of the right column / bottom row (left/top get 16-x / 16-y)
+                                    SkPMColor a00, SkPMColor a01,    // top-row pixels: a00 is weighted by (16-x), a01 by x
+                                    SkPMColor a10, SkPMColor a11,    // bottom-row pixels: a10 is weighted by (16-x), a11 by x
+                                    SkPMColor *dst) {                // receives the single blended pixel
+    asm volatile(
+                 "vdup.8         d0, %[y]                \n\t"   // duplicate y into d0
+                 "vmov.u8        d16, #16                \n\t"   // set up constant in d16
+                 "vsub.u8        d1, d16, d0             \n\t"   // d1 = 16-y
+                 // pair the pixels by row so one widening multiply applies the row weight to two pixels at once
+                 "vdup.32        d4, %[a00]              \n\t"   // duplicate a00 into d4
+                 "vdup.32        d5, %[a10]              \n\t"   // duplicate a10 into d5
+                 "vmov.32        d4[1], %[a01]           \n\t"   // set top of d4 to a01
+                 "vmov.32        d5[1], %[a11]           \n\t"   // set top of d5 to a11
+                 // 8-bit x 8-bit -> 16-bit products: d6 = a00*(16-y), d7 = a01*(16-y), d0 = a10*y, d1 = a11*y
+                 "vmull.u8       q3, d4, d1              \n\t"   // q3 = [a01|a00] * (16-y)
+                 "vmull.u8       q0, d5, d0              \n\t"   // q0 = [a11|a10] * y
+                 // d5 is reused for x from here on; the y-weighted pixels live in d6, d7, d0, d1
+                 "vdup.16        d5, %[x]                \n\t"   // duplicate x into d5
+                 "vmov.u16       d16, #16                \n\t"   // set up constant in d16
+                 "vsub.u16       d3, d16, d5             \n\t"   // d3 = 16-x
+                 // accumulate the four y-weighted pixels with their x weights in 16-bit lanes
+                 "vmul.i16       d4, d7, d5              \n\t"   // d4  = a01*(16-y) * x
+                 "vmla.i16       d4, d1, d5              \n\t"   // d4 += a11*y * x
+                 "vmla.i16       d4, d6, d3              \n\t"   // d4 += a00*(16-y) * (16-x)
+                 "vmla.i16       d4, d0, d3              \n\t"   // d4 += a10*y * (16-x)
+                 "vshrn.i16      d0, q2, #8              \n\t"   // shift down result by 8 and narrow to 8 bits per channel (q2 = d4:d5; only the d4 half is the result)
+                 "vst1.32        {d0[0]}, [%[dst]]       \n\t"   // store result
+                 :
+                 : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst)   // no output operands: the pixel is written through dst, covered by the "memory" clobber
+                 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16"   // NOTE: r4 and d2 are listed although the asm above does not reference them
+                 );
+}
+
+static inline void Filter_32_direct_alpha(unsigned x, unsigned y,   // Blends four 32-bit pixels into one and then scales the result; x and y are the weights (callers visible in this file pass values masked with 0xF)
+                                          SkPMColor a00, SkPMColor a01,   // a00 ends up weighted by (16-x)*(16-y), a01 by x*(16-y)
+                                          SkPMColor a10, SkPMColor a11,   // a10 ends up weighted by (16-x)*y,      a11 by x*y
+                                          SkPMColor *dst, uint16_t scale) {   // result is stored to *dst; each channel is multiplied by scale and shifted down by 8 (callers visible here pass s.fAlphaScale)
+    asm volatile(
+                 "vdup.8         d0, %[y]                \n\t"   // duplicate y into every byte of d0
+                 "vmov.u8        d16, #16                \n\t"   // set up constant 16 in every byte of d16
+                 "vsub.u8        d1, d16, d0             \n\t"   // d1 = 16-y
+                 // pack the a00/a01 pair into d4 and the a10/a11 pair into d5
+                 "vdup.32        d4, %[a00]              \n\t"   // duplicate a00 into d4
+                 "vdup.32        d5, %[a10]              \n\t"   // duplicate a10 into d5
+                 "vmov.32        d4[1], %[a01]           \n\t"   // set top of d4 to a01
+                 "vmov.32        d5[1], %[a11]           \n\t"   // set top of d5 to a11
+                 // apply the y weights; each 8-bit channel widens to 16 bits (q3 = d6:d7, q0 = d0:d1)
+                 "vmull.u8       q3, d4, d1              \n\t"   // q3 = [a01|a00] * (16-y), so d6 holds a00 and d7 holds a01
+                 "vmull.u8       q0, d5, d0              \n\t"   // q0 = [a11|a10] * y, so d0 holds a10 and d1 holds a11 (the old y / 16-y values are overwritten)
+                 // d5 and d16 are free again here and are reused for the x weights
+                 "vdup.16        d5, %[x]                \n\t"   // duplicate x into each 16-bit lane of d5
+                 "vmov.u16       d16, #16                \n\t"   // set up constant 16 in each 16-bit lane of d16
+                 "vsub.u16       d3, d16, d5             \n\t"   // d3 = 16-x
+                 // sum the four weighted pixels per channel; the four weights add up to 256
+                 "vmul.i16       d4, d7, d5              \n\t"   // d4  = a01*(16-y) * x
+                 "vmla.i16       d4, d1, d5              \n\t"   // d4 += a11*y * x
+                 "vmla.i16       d4, d6, d3              \n\t"   // d4 += a00*(16-y) * (16-x)
+                 "vmla.i16       d4, d0, d3              \n\t"   // d4 += a10*y * (16-x)
+                 "vdup.16        d3, %[scale]            \n\t"   // duplicate scale into each 16-bit lane of d3 (16-x is no longer needed)
+                 "vshr.u16       d4, d4, #8              \n\t"   // shift down result by 8, leaving the blended channel values in 16-bit lanes
+                 "vmul.i16       d4, d4, d3              \n\t"   // multiply each blended channel by scale
+                 "vshrn.i16      d0, q2, #8              \n\t"   // shift q2 (d4:d5) down by 8 and narrow to bytes; the d4 half lands in d0[0]
+                 "vst1.32        {d0[0]}, [%[dst]]       \n\t"   // store the low 32 bits of d0 (the scaled pixel) to dst
+                 :                                               // no output operands: the result leaves through the store to dst
+                 : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst), [scale] "r" (scale)
+                 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16"   // NOTE(review): r4 and d2 are listed as clobbered but never appear in the asm text above
+                 );
+}
+
+// Filtered sampler for an 8-bit indexed bitmap producing 32-bit pixels, for
+// the case where every output pixel reads from the same pair of source rows.
+//
+// xy holds one leading word describing the rows (y0:14 | subY:4 | y1:14)
+// followed by `count` words describing the columns (x0:14 | subX:4 | x1:14).
+// Each output pixel is the Filter_32_direct blend of the four color-table
+// entries addressed by (row, column) pairs; no alpha scaling is applied.
+// The color table stays locked for the duration of the call.
+void SI8_opaque_D32_filter_DX_arm(const SkBitmapProcState& s,
+                                  const uint32_t* SK_RESTRICT xy,
+                                  int count, SkPMColor* SK_RESTRICT colors) {
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fDoFilter);
+
+    const SkPMColor* SK_RESTRICT palette = s.fBitmap->getColorTable()->lockColors();
+    const char* SK_RESTRICT pixelBase = (const char*)s.fBitmap->getPixels();
+    const unsigned rowBytes = s.fBitmap->rowBytes();
+
+    // Decode the shared row word once, up front.
+    const uint32_t packedY = *xy++;
+    const unsigned rowIndex0 = packedY >> 18;
+    const unsigned rowIndex1 = packedY & 0x3FFF;
+    const unsigned subY = (packedY >> 14) & 0xF;
+    const uint8_t* SK_RESTRICT rowY0 = (const uint8_t*)(pixelBase + rowIndex0 * rowBytes);
+    const uint8_t* SK_RESTRICT rowY1 = (const uint8_t*)(pixelBase + rowIndex1 * rowBytes);
+
+    // The body always runs at least once; count is asserted positive above.
+    for (;;) {
+        const uint32_t packedX = *xy++;
+        const unsigned x1 = packedX & 0x3FFF;
+        const unsigned subX = (packedX >> 14) & 0xF;
+        const unsigned x0 = packedX >> 18;
+
+        Filter_32_direct(subX, subY,
+                         palette[rowY0[x0]], palette[rowY0[x1]],
+                         palette[rowY1[x0]], palette[rowY1[x1]],
+                         colors);
+        colors += 1;
+
+        if (--count == 0) {
+            break;
+        }
+    }
+
+    s.fBitmap->getColorTable()->unlockColors(false);
+}
+
+// Filtered sampler for an 8-bit indexed bitmap producing 32-bit pixels, for
+// the case where every output pixel carries its own row and column pair.
+//
+// xy holds two words per output pixel: first (y0:14 | subY:4 | y1:14), then
+// (x0:14 | subX:4 | x1:14). Each output pixel is the Filter_32_direct blend
+// of the four color-table entries addressed by those rows and columns; no
+// alpha scaling is applied. The color table stays locked for the duration of
+// the call.
+void SI8_opaque_D32_filter_DXDY_arm(const SkBitmapProcState& s,
+                                    const uint32_t* SK_RESTRICT xy,
+                                    int count, SkPMColor* SK_RESTRICT colors) {
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fDoFilter);
+
+    const SkPMColor* SK_RESTRICT palette = s.fBitmap->getColorTable()->lockColors();
+    const char* SK_RESTRICT pixelBase = (const char*)s.fBitmap->getPixels();
+    const int rowBytes = s.fBitmap->rowBytes();
+
+    // The body always runs at least once; count is asserted positive above.
+    for (;;) {
+        const uint32_t packedY = *xy++;
+        const unsigned y1 = packedY & 0x3FFF;
+        const unsigned subY = (packedY >> 14) & 0xF;
+        const unsigned y0 = packedY >> 18;
+
+        const uint32_t packedX = *xy++;
+        const unsigned x1 = packedX & 0x3FFF;
+        const unsigned subX = (packedX >> 14) & 0xF;
+        const unsigned x0 = packedX >> 18;
+
+        const uint8_t* SK_RESTRICT rowY0 = (const uint8_t*)(pixelBase + y0 * rowBytes);
+        const uint8_t* SK_RESTRICT rowY1 = (const uint8_t*)(pixelBase + y1 * rowBytes);
+
+        Filter_32_direct(subX, subY,
+                         palette[rowY0[x0]], palette[rowY0[x1]],
+                         palette[rowY1[x0]], palette[rowY1[x1]],
+                         colors);
+        colors += 1;
+
+        if (--count == 0) {
+            break;
+        }
+    }
+
+    s.fBitmap->getColorTable()->unlockColors(false);
+}
+
+// Filtered sampler for an 8-bit indexed bitmap producing 32-bit pixels scaled
+// by s.fAlphaScale, for the case where every output pixel reads from the same
+// pair of source rows.
+//
+// xy holds one leading word describing the rows (y0:14 | subY:4 | y1:14)
+// followed by `count` words describing the columns (x0:14 | subX:4 | x1:14).
+// Each output pixel is the Filter_32_direct_alpha blend of the four
+// color-table entries addressed by (row, column) pairs. The color table stays
+// locked for the duration of the call.
+void SI8_alpha_D32_filter_DX_arm(const SkBitmapProcState& s,
+                                 const uint32_t* SK_RESTRICT xy,
+                                 int count, SkPMColor* SK_RESTRICT colors) {
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fDoFilter);
+
+    const unsigned alphaScale = s.fAlphaScale;
+    const SkPMColor* SK_RESTRICT palette = s.fBitmap->getColorTable()->lockColors();
+    const char* SK_RESTRICT pixelBase = (const char*)s.fBitmap->getPixels();
+    const unsigned rowBytes = s.fBitmap->rowBytes();
+
+    // Decode the shared row word once, up front.
+    const uint32_t packedY = *xy++;
+    const unsigned rowIndex0 = packedY >> 18;
+    const unsigned rowIndex1 = packedY & 0x3FFF;
+    const unsigned subY = (packedY >> 14) & 0xF;
+    const uint8_t* SK_RESTRICT rowY0 = (const uint8_t*)(pixelBase + rowIndex0 * rowBytes);
+    const uint8_t* SK_RESTRICT rowY1 = (const uint8_t*)(pixelBase + rowIndex1 * rowBytes);
+
+    // The body always runs at least once; count is asserted positive above.
+    for (;;) {
+        const uint32_t packedX = *xy++;
+        const unsigned x1 = packedX & 0x3FFF;
+        const unsigned subX = (packedX >> 14) & 0xF;
+        const unsigned x0 = packedX >> 18;
+
+        Filter_32_direct_alpha(subX, subY,
+                               palette[rowY0[x0]], palette[rowY0[x1]],
+                               palette[rowY1[x0]], palette[rowY1[x1]],
+                               colors, alphaScale);
+        colors += 1;
+
+        if (--count == 0) {
+            break;
+        }
+    }
+
+    s.fBitmap->getColorTable()->unlockColors(false);
+}
+
+// Filtered sampler for an 8-bit indexed bitmap producing 32-bit pixels scaled
+// by s.fAlphaScale, for the case where every output pixel carries its own row
+// and column pair.
+//
+// xy holds two words per output pixel: first (y0:14 | subY:4 | y1:14), then
+// (x0:14 | subX:4 | x1:14). Each output pixel is the Filter_32_direct_alpha
+// blend of the four color-table entries addressed by those rows and columns.
+// The color table stays locked for the duration of the call.
+void SI8_alpha_D32_filter_DXDY_arm(const SkBitmapProcState& s,
+                                   const uint32_t* SK_RESTRICT xy,
+                                   int count, SkPMColor* SK_RESTRICT colors) {
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fDoFilter);
+
+    const unsigned alphaScale = s.fAlphaScale;
+    const SkPMColor* SK_RESTRICT palette = s.fBitmap->getColorTable()->lockColors();
+    const char* SK_RESTRICT pixelBase = (const char*)s.fBitmap->getPixels();
+    const int rowBytes = s.fBitmap->rowBytes();
+
+    // The body always runs at least once; count is asserted positive above.
+    for (;;) {
+        const uint32_t packedY = *xy++;
+        const unsigned y1 = packedY & 0x3FFF;
+        const unsigned subY = (packedY >> 14) & 0xF;
+        const unsigned y0 = packedY >> 18;
+
+        const uint32_t packedX = *xy++;
+        const unsigned x1 = packedX & 0x3FFF;
+        const unsigned subX = (packedX >> 14) & 0xF;
+        const unsigned x0 = packedX >> 18;
+
+        const uint8_t* SK_RESTRICT rowY0 = (const uint8_t*)(pixelBase + y0 * rowBytes);
+        const uint8_t* SK_RESTRICT rowY1 = (const uint8_t*)(pixelBase + y1 * rowBytes);
+
+        Filter_32_direct_alpha(subX, subY,
+                               palette[rowY0[x0]], palette[rowY0[x1]],
+                               palette[rowY1[x0]], palette[rowY1[x1]],
+                               colors, alphaScale);
+        colors += 1;
+
+        if (--count == 0) {
+            break;
+        }
+    }
+
+    s.fBitmap->getColorTable()->unlockColors(false);
+}
+#endif //defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Installs the ARM-specific sample procs that apply to the current state.
+//
+// The choice depends on the bitmap config, whether filtering is on, whether
+// fAlphaScale is 256, and whether fInvType has only translate/scale bits.
+// Each assignment is compiled in only when its architecture guard holds;
+// fSampleProc16 / fSampleProc32 are left untouched when nothing matches.
+void SkBitmapProcState::platformProcs() {
+    const bool filtering = fDoFilter;
+    const bool opaque = (fAlphaScale == 256);
+    const bool dxOnly =
+            fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask);
+
+    switch (fBitmap->config()) {
+        case SkBitmap::kRGB_565_Config:
+#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
+            if (filtering && dxOnly) {
+                fSampleProc16 = S16_D16_filter_DX_arm;
+            }
+#endif
+#if __ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
+            if (!filtering && dxOnly) {
+                fSampleProc16 = S16_D16_nofilter_DX_arm;
+            }
+#endif
+            break;  // k565
+
+        case SkBitmap::kIndex8_Config:
+#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
+            if (!filtering && dxOnly) {
+                fSampleProc16 = SI8_D16_nofilter_DX_arm;
+                if (opaque) {
+                    fSampleProc32 = SI8_opaque_D32_nofilter_DX_arm;
+                }
+            }
+#endif
+#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
+            if (filtering) {
+                // Four filtered variants: {opaque, alpha} x {DX, DXDY}.
+                if (opaque && dxOnly) {
+                    fSampleProc32 = SI8_opaque_D32_filter_DX_arm;
+                } else if (opaque) {
+                    fSampleProc32 = SI8_opaque_D32_filter_DXDY_arm;
+                } else if (dxOnly) {
+                    fSampleProc32 = SI8_alpha_D32_filter_DX_arm;
+                } else {
+                    fSampleProc32 = SI8_alpha_D32_filter_DXDY_arm;
+                }
+            }
+#endif
+            break;  // kIndex8
+
+        default:
+            break;
+    }
+}
+
author	reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>	2009-08-03 17:22:46 +0000
committer	reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>	2009-08-03 17:22:46 +0000
commit	a0bd7f48aaaf2c86ba5b5de068e9d1f56e721a3c (patch)
tree	5d8b782c97909e391ed90689a554898d4582ca48
parent	afb055ad792a1eb859fea7baf44a1bcfc4a0a11d (diff)