diff options
author | kevin.petit <kevin.petit@arm.com> | 2014-06-26 02:51:29 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2014-06-26 02:51:29 -0700 |
commit | 0d448303099037d56d8f97ce87d8cae05dd9fdab (patch) | |
tree | bb33ece5a9a7be855d2bd3be7d867082147d4000 | |
parent | 1ea04a0a4d28d42e2e6031e98c2af96153ac1dad (diff) |
ARM Skia NEON patches - 40 - arm64: S32A_D565_Opaque
Here are some perf results:
+-------+------------+------------+
| count | Cortex-A53 | Cortex-A57 |
+-------+------------+------------+
| 1 | -2.54% | -5.39% |
+-------+------------+------------+
| 2 | -0.66% | -2.08% |
+-------+------------+------------+
| 4 | -11.13% | 0.00% |
+-------+------------+------------+
| 8 | -5.79% | -1.30% |
+-------+------------+------------+
| 16 | 71.60% | 93.27% |
+-------+------------+------------+
| 64 | 30.99% | 57.35% |
+-------+------------+------------+
| 256 | 25.41% | 52.59% |
+-------+------------+------------+
| 1024 | 25.56% | 53.76% |
+-------+------------+------------+
Signed-off-by: Kévin PETIT <kevin.petit@arm.com>
BUG=skia:
R=mtklein@google.com, djsollen@google.com
Author: kevin.petit@arm.com
Review URL: https://codereview.chromium.org/346843003
-rw-r--r-- | src/opts/SkBlitRow_opts_arm_neon.cpp | 97 |
1 files changed, 92 insertions, 5 deletions
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index 01a6a2aa74..a8a0535baa 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -365,7 +365,98 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, ); } } -#endif + +#else // #ifdef SK_CPU_ARM32 + +void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, int count, + U8CPU alpha, int /*x*/, int /*y*/) { + SkASSERT(255 == alpha); + + if (count >= 16) { + asm ( + "movi v4.8h, #0x80 \t\n" + + "1: \t\n" + "sub %[count], %[count], #16 \t\n" + "ld1 {v16.8h-v17.8h}, [%[dst]] \t\n" + "ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n" + "prfm pldl1keep, [%[src],#512] \t\n" + "prfm pldl1keep, [%[dst],#256] \t\n" + "ushr v20.8h, v17.8h, #5 \t\n" + "ushr v31.8h, v16.8h, #5 \t\n" + "xtn v6.8b, v31.8h \t\n" + "xtn2 v6.16b, v20.8h \t\n" + "ushr v20.8h, v17.8h, #11 \t\n" + "shl v19.16b, v6.16b, #2 \t\n" + "ushr v31.8h, v16.8h, #11 \t\n" + "xtn v22.8b, v31.8h \t\n" + "xtn2 v22.16b, v20.8h \t\n" + "shl v18.16b, v22.16b, #3 \t\n" + "mvn v3.16b, v3.16b \t\n" + "xtn v16.8b, v16.8h \t\n" + "mov v7.16b, v4.16b \t\n" + "xtn2 v16.16b, v17.8h \t\n" + "umlal v7.8h, v3.8b, v19.8b \t\n" + "shl v16.16b, v16.16b, #3 \t\n" + "mov v22.16b, v4.16b \t\n" + "ushr v24.8h, v7.8h, #6 \t\n" + "umlal v22.8h, v3.8b, v18.8b \t\n" + "ushr v20.8h, v22.8h, #5 \t\n" + "addhn v20.8b, v22.8h, v20.8h \t\n" + "cmp %[count], #16 \t\n" + "mov v6.16b, v4.16b \t\n" + "mov v5.16b, v4.16b \t\n" + "umlal v6.8h, v3.8b, v16.8b \t\n" + "umlal2 v5.8h, v3.16b, v19.16b \t\n" + "mov v17.16b, v4.16b \t\n" + "ushr v19.8h, v6.8h, #5 \t\n" + "umlal2 v17.8h, v3.16b, v18.16b \t\n" + "addhn v7.8b, v7.8h, v24.8h \t\n" + "ushr v18.8h, v5.8h, #6 \t\n" + "ushr v21.8h, v17.8h, #5 \t\n" + "addhn2 v7.16b, v5.8h, v18.8h \t\n" + "addhn2 v20.16b, v17.8h, v21.8h \t\n" + "mov v22.16b, v4.16b \t\n" + "addhn v6.8b, v6.8h, v19.8h \t\n" + "umlal2 v22.8h, v3.16b, v16.16b \t\n" + "ushr v5.8h, v22.8h, #5 \t\n" + "addhn2 v6.16b, v22.8h, v5.8h \t\n" + "uqadd v7.16b, v1.16b, v7.16b \t\n" + "uqadd v20.16b, v2.16b, v20.16b \t\n" + "uqadd v6.16b, v0.16b, v6.16b \t\n" + "shll v22.8h, v20.8b, #8 \t\n" + "shll v5.8h, v7.8b, #8 \t\n" + "sri v22.8h, v5.8h, #5 \t\n" + "shll v17.8h, v6.8b, #8 \t\n" + "shll2 v23.8h, v20.16b, #8 \t\n" + "shll2 v7.8h, v7.16b, #8 \t\n" + "sri v22.8h, v17.8h, #11 \t\n" + "sri v23.8h, v7.8h, #5 \t\n" + "shll2 v6.8h, v6.16b, #8 \t\n" + "st1 {v22.8h}, [%[dst]], #16 \t\n" + "sri v23.8h, v6.8h, #11 \t\n" + "st1 {v23.8h}, [%[dst]], #16 \t\n" + "b.ge 1b \t\n" + : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count) + :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v31" + ); + } + // Leftovers + if (count > 0) { + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + if (c) { + *dst = SkSrcOver32To16(c, *dst); + } + dst += 1; + } while (--count != 0); + } +} +#endif // #ifdef SK_CPU_ARM32 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { prod += vdupq_n_u16(128); @@ -1552,11 +1643,7 @@ const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { // no dither S32_D565_Opaque_neon, S32_D565_Blend_neon, -#ifdef SK_CPU_ARM32 S32A_D565_Opaque_neon, -#else - NULL, -#endif S32A_D565_Blend_neon, // dither |