aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts
diff options
context:
space:
mode:
authorGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-07-30 02:22:31 +0000
committerGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-07-30 02:22:31 +0000
commit 7d2e322beeb9361f93a7983193bdf20ac972d341 (patch)
tree f1663ffd71979fbb71291e4cfc9984b70c1946f1 /src/opts
parent b6137c3139b1b1da99ad9f6c28ac0d9e8f910ff6 (diff)
add optimization table for blitproc functions
git-svn-id: http://skia.googlecode.com/svn/trunk@295 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/opts')
-rw-r--r-- src/opts/SkBlitRow_opts_arm.cpp 234
-rw-r--r-- src/opts/SkBlitRow_opts_none.cpp 32
2 files changed, 266 insertions, 0 deletions
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
new file mode 100644
index 0000000000..2febe43f0f
--- /dev/null
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -0,0 +1,234 @@
+/*
+ **
+ ** Copyright 2009, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ **
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ **
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ */
+
+#include <machine/cpu-features.h>
+#include "SkBlitRow.h"
+
+#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
+/* NEON blit of `count` SkPMColor pixels from src over the 565 pixels at dst; alpha must be 255.
+ * Each 565 field becomes saturate(s + ((t + (t >> w)) >> 8)) with t = 128 + field8 * ~a, where
+ * field8 is the field shifted to the top of a byte, w is 5 or 6, s is the matching src byte and
+ * a is src byte 3. x and y are ignored. The asm modifies its pointer operands, so they are
+ * declared as read-write/output operands rather than input-only ones. */
+static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,
+                                  int count, U8CPU alpha, int /*x*/, int /*y*/) {
+    SkASSERT(255 == alpha);
+
+    if (count >= 8) {
+        uint16_t* SK_RESTRICT keep_dst;  // set by the asm: store address, one group behind dst
+        // The first group advances by count % 8 (or 8) yet blends a full 8; later groups overlap it.
+        asm volatile (
+            "ands ip, %[count], #7 \n\t"
+            "vmov.u8 d31, #1<<7 \n\t"
+            "vld1.16 {q12}, [%[dst]] \n\t"
+            "vld4.8 {d0-d3}, [%[src]] \n\t"
+            "moveq ip, #8 \n\t"
+            "mov %[keep_dst], %[dst] \n\t"
+            // step past the first (possibly partial) group; ip holds its pixel count
+            "add %[src], %[src], ip, LSL#2 \n\t"
+            "add %[dst], %[dst], ip, LSL#1 \n\t"
+            "subs %[count], %[count], ip \n\t"
+            "b 9f \n\t"
+            // LOOP: load the next group first, then store the previous group's result
+            "2: \n\t"
+
+            "vld1.16 {q12}, [%[dst]]! \n\t"
+            "vld4.8 {d0-d3}, [%[src]]! \n\t"
+            "vst1.16 {q10}, [%[keep_dst]] \n\t"
+            "sub %[keep_dst], %[dst], #8*2 \n\t"
+            "subs %[count], %[count], #8 \n\t"
+            "9: \n\t"
+            "pld [%[dst],#32] \n\t"
+            // expand 0565 q12 to 8888 {d4-d7}
+            "vmovn.u16 d4, q12 \n\t"
+            "vshr.u16 q11, q12, #5 \n\t"
+            "vshr.u16 q10, q12, #6+5 \n\t"
+            "vmovn.u16 d5, q11 \n\t"
+            "vmovn.u16 d6, q10 \n\t"
+            "vshl.u8 d4, d4, #3 \n\t"
+            "vshl.u8 d5, d5, #2 \n\t"
+            "vshl.u8 d6, d6, #3 \n\t"
+            // seed the three 16-bit accumulators with 128 (d31 widened)
+            "vmovl.u8 q14, d31 \n\t"
+            "vmovl.u8 q13, d31 \n\t"
+            "vmovl.u8 q12, d31 \n\t"
+
+            // blend math; duplicated in the 4/2/1 & 8pix vsns
+            "vmvn.8 d30, d3 \n\t"
+            "vmlal.u8 q14, d30, d6 \n\t"
+            "vmlal.u8 q13, d30, d5 \n\t"
+            "vmlal.u8 q12, d30, d4 \n\t"
+            "vshr.u16 q8, q14, #5 \n\t"
+            "vshr.u16 q9, q13, #6 \n\t"
+            "vaddhn.u16 d6, q14, q8 \n\t"
+            "vshr.u16 q8, q12, #5 \n\t"
+            "vaddhn.u16 d5, q13, q9 \n\t"
+            "vqadd.u8 d6, d6, d0 \n\t" // moved up
+            "vaddhn.u16 d4, q12, q8 \n\t"
+            // intentionally don't calculate alpha
+            // result in d4-d6
+
+            "vqadd.u8 d5, d5, d1 \n\t"
+            "vqadd.u8 d4, d4, d2 \n\t"
+
+            // pack 8888 {d4-d6} to 0565 q10
+            "vshll.u8 q10, d6, #8 \n\t"
+            "vshll.u8 q3, d5, #8 \n\t"
+            "vshll.u8 q2, d4, #8 \n\t"
+            "vsri.u16 q10, q3, #5 \n\t"
+            "vsri.u16 q10, q2, #11 \n\t"
+            // the NEON ops above leave the flags from the last subs intact
+            "bne 2b \n\t"
+
+            "1: \n\t"
+            "vst1.16 {q10}, [%[keep_dst]] \n\t"
+            : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src), [keep_dst] "=&r" (keep_dst)
+            : // no input-only operands: every register operand is written by the asm
+            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", "d16","d17","d18","d19",
+              "d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
+        );
+    } else { // handle count < 8
+        uint16_t* SK_RESTRICT keep_dst;  // set by the asm: start of dst, used for the stores
+        // Bits 2, 1 and 0 of count select a 4-, 2- and 1-pixel load and the matching store.
+        asm volatile (
+            "vmov.u8 d31, #1<<7 \n\t"
+            "mov %[keep_dst], %[dst] \n\t"
+
+            "tst %[count], #4 \n\t"
+            "beq 14f \n\t"
+            "vld1.16 {d25}, [%[dst]]! \n\t"
+            "vld1.32 {q1}, [%[src]]! \n\t"
+
+            "14: \n\t"
+            "tst %[count], #2 \n\t"
+            "beq 12f \n\t"
+            "vld1.32 {d24[1]}, [%[dst]]! \n\t"
+            "vld1.32 {d1}, [%[src]]! \n\t"
+
+            "12: \n\t"
+            "tst %[count], #1 \n\t"
+            "beq 11f \n\t"
+            "vld1.16 {d24[1]}, [%[dst]]! \n\t"
+            "vld1.32 {d0[1]}, [%[src]]! \n\t"
+
+            "11: \n\t"
+            // unzips achieve the same as a vld4 operation
+            "vuzpq.u16 q0, q1 \n\t"
+            "vuzp.u8 d0, d1 \n\t"
+            "vuzp.u8 d2, d3 \n\t"
+            // expand 0565 q12 to 8888 {d4-d7}
+            "vmovn.u16 d4, q12 \n\t"
+            "vshr.u16 q11, q12, #5 \n\t"
+            "vshr.u16 q10, q12, #6+5 \n\t"
+            "vmovn.u16 d5, q11 \n\t"
+            "vmovn.u16 d6, q10 \n\t"
+            "vshl.u8 d4, d4, #3 \n\t"
+            "vshl.u8 d5, d5, #2 \n\t"
+            "vshl.u8 d6, d6, #3 \n\t"
+            // seed the three 16-bit accumulators with 128 (d31 widened)
+            "vmovl.u8 q14, d31 \n\t"
+            "vmovl.u8 q13, d31 \n\t"
+            "vmovl.u8 q12, d31 \n\t"
+
+            // blend math; duplicated in the 4/2/1 & 8pix vsns
+            "vmvn.8 d30, d3 \n\t"
+            "vmlal.u8 q14, d30, d6 \n\t"
+            "vmlal.u8 q13, d30, d5 \n\t"
+            "vmlal.u8 q12, d30, d4 \n\t"
+            "vshr.u16 q8, q14, #5 \n\t"
+            "vshr.u16 q9, q13, #6 \n\t"
+            "vaddhn.u16 d6, q14, q8 \n\t"
+            "vshr.u16 q8, q12, #5 \n\t"
+            "vaddhn.u16 d5, q13, q9 \n\t"
+            "vqadd.u8 d6, d6, d0 \n\t" // moved up
+            "vaddhn.u16 d4, q12, q8 \n\t"
+            // intentionally don't calculate alpha
+            // result in d4-d6
+
+            "vqadd.u8 d5, d5, d1 \n\t"
+            "vqadd.u8 d4, d4, d2 \n\t"
+
+            // pack 8888 {d4-d6} to 0565 q10
+            "vshll.u8 q10, d6, #8 \n\t"
+            "vshll.u8 q3, d5, #8 \n\t"
+            "vshll.u8 q2, d4, #8 \n\t"
+            "vsri.u16 q10, q3, #5 \n\t"
+            "vsri.u16 q10, q2, #11 \n\t"
+
+            // store only the lanes that were loaded above
+            "tst %[count], #4 \n\t"
+            "beq 24f \n\t"
+            "vst1.16 {d21}, [%[keep_dst]]! \n\t"
+
+            "24: \n\t"
+            "tst %[count], #2 \n\t"
+            "beq 22f \n\t"
+            "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
+
+            "22: \n\t"
+            "tst %[count], #1 \n\t"
+            "beq 21f \n\t"
+            "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
+
+            "21: \n\t"
+            : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src), [keep_dst] "=&r" (keep_dst)
+            : // no input-only operands: dst, src and keep_dst are all written by the asm
+            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", "d16","d17","d18","d19",
+              "d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
+        );
+    }
+}
+#define S32A_D565_OPAQUE_PROC S32A_D565_Opaque_neon
+#else
+#define S32A_D565_OPAQUE_PROC NULL
+#endif
+
+/* Don't have a special version that assumes each src is opaque, but our S32A
+ is still faster than the default, so use it here
+ */
+#define S32_D565_OPAQUE_PROC S32A_D565_OPAQUE_PROC
+
+///////////////////////////////////////////////////////////////////////////////
+// ARM platform table of 565 blit-row procs; NULL presumably means "no override" (confirm in SkBlitRow).
+const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = {
+ // no dither
+ S32_D565_OPAQUE_PROC, // S32_D565_Opaque: defined above as the same proc as the S32A slot
+ NULL, // S32_D565_Blend,
+ S32A_D565_OPAQUE_PROC, // S32A_D565_Opaque: the NEON routine, or NULL when the #if above fails
+ NULL, // S32A_D565_Blend,
+
+ // dither
+ NULL, // S32_D565_Opaque_Dither,
+ NULL, // S32_D565_Blend_Dither,
+ NULL, // S32A_D565_Opaque_Dither,
+ NULL, // S32A_D565_Blend_Dither
+};
+// ARM platform table of 4444 blit-row procs: every slot is NULL; slot names are in the comments.
+const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = {
+ // no dither
+ NULL, // S32_D4444_Opaque,
+ NULL, // S32_D4444_Blend,
+ NULL, // S32A_D4444_Opaque,
+ NULL, // S32A_D4444_Blend,
+
+ // dither
+ NULL, // S32_D4444_Opaque_Dither,
+ NULL, // S32_D4444_Blend_Dither,
+ NULL, // S32A_D4444_Opaque_Dither,
+ NULL, // S32A_D4444_Blend_Dither
+};
+
diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp
new file mode 100644
index 0000000000..88ab7c4203
--- /dev/null
+++ b/src/opts/SkBlitRow_opts_none.cpp
@@ -0,0 +1,32 @@
+#include "SkBlitRow.h"
+
+// Platform impl of Platform_procs with no overrides
+// 565 blit-row table for platforms without overrides: all eight slots are NULL.
+const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = {
+ // no dither
+ NULL, // S32_D565_Opaque,
+ NULL, // S32_D565_Blend,
+ NULL, // S32A_D565_Opaque,
+ NULL, // S32A_D565_Blend,
+
+ // dither
+ NULL, // S32_D565_Opaque_Dither,
+ NULL, // S32_D565_Blend_Dither,
+ NULL, // S32A_D565_Opaque_Dither,
+ NULL, // S32A_D565_Blend_Dither
+};
+// 4444 blit-row table for platforms without overrides: all eight slots are NULL.
+const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = {
+ // no dither
+ NULL, // S32_D4444_Opaque,
+ NULL, // S32_D4444_Blend,
+ NULL, // S32A_D4444_Opaque,
+ NULL, // S32A_D4444_Blend,
+
+ // dither
+ NULL, // S32_D4444_Opaque_Dither,
+ NULL, // S32_D4444_Blend_Dither,
+ NULL, // S32A_D4444_Opaque_Dither,
+ NULL, // S32A_D4444_Blend_Dither
+};
+