diff options
author | 2017-03-07 00:54:47 +0000 | |
---|---|---|
committer | 2017-03-07 00:55:02 +0000 | |
commit | 4b19b403944dd4ab70507c0dea2aa3d38f145eac (patch) | |
tree | 558cff9e0fede4dd57f1a7c9d15ef92f44d6f4a0 | |
parent | 90165c2269bc33ca3d6aaa73d528194daf48da4e (diff) |
Revert "Fix new IT blocks ARMv8"
This reverts commit 90165c2269bc33ca3d6aaa73d528194daf48da4e.
Reason for revert: Skia and Chrome iOS builds broken.
../../third_party/skia/include/private/SkFixed.h:106:41: error: invalid output constraint '+t' in asm
asm("vcvt.s32.f32 %0, %0, #16": "+t"(x));
Original change's description:
> Fix new IT blocks ARMv8
>
> ARMv8 specifies that an IT block should be followed by only one 16-bit instruction.
> * SkFloatToFixed_arm is back to a C implementation that mirrors the assembly code.
>
> * S32A_D565_Opaque_neon no longer hard-codes the temporary 'ip' register inside the
> IT block, so the compiler can choose the best register in that context. It also
> replaces 'keep_dst' with 'ip' where the choice of low or high register does not matter.
>
> BUG=skia:
>
> CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD
>
> Change-Id: I096759841c972e9300c1d0293bc80d3c3ff2747b
> Reviewed-on: https://skia-review.googlesource.com/9340
> Reviewed-by: Mike Klein <mtklein@chromium.org>
> Commit-Queue: Mike Klein <mtklein@chromium.org>
>
TBR=mtklein@chromium.org,amaury.leleyzour@arm.com,reviews@skia.org
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=skia:
CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD
Change-Id: Idbcbda88039066153e1c34233d43366ab114fd01
Reviewed-on: https://skia-review.googlesource.com/9332
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
-rw-r--r-- | include/private/SkFixed.h | 16 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm_neon.cpp | 110 |
2 files changed, 68 insertions, 58 deletions
diff --git a/include/private/SkFixed.h b/include/private/SkFixed.h index 526b7d4509..b2eea5f624 100644 --- a/include/private/SkFixed.h +++ b/include/private/SkFixed.h @@ -102,9 +102,19 @@ inline SkFixed SkFixedMul_longlong(SkFixed a, SkFixed b) { */ SK_ALWAYS_INLINE SkFixed SkFloatToFixed_arm(float x) { - int32_t y; - asm("vcvt.s32.f32 %0, %0, #16": "+t"(x)); - memcpy(&y, &x, sizeof(y)); + int32_t y, z; + asm("movs %1, %3, lsl #1 \n" + "mov %2, #0x8E \n" + "sub %1, %2, %1, lsr #24 \n" + "mov %2, %3, lsl #8 \n" + "orr %2, %2, #0x80000000 \n" + "mov %1, %2, lsr %1 \n" + "it cs \n" + "rsbcs %1, %1, #0 \n" + : "=r"(x), "=&r"(y), "=&r"(z) + : "r"(x) + : "cc" + ); return y; } inline SkFixed SkFixedMul_arm(SkFixed x, SkFixed y) diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index d30d14fb77..7998a89517 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -194,85 +194,85 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, SkASSERT(255 == alpha); if (count >= 8) { - int32_t tmp = 0; + uint16_t* SK_RESTRICT keep_dst = 0; asm volatile ( - "ands %[tmp], %[count], #7 \n\t" - "vmov.u8 d31, #1<<7 \n\t" - "vld1.16 {q12}, [%[dst]] \n\t" - "vld4.8 {d0-d3}, [%[src]] \n\t" + "ands ip, %[count], #7 \n\t" + "vmov.u8 d31, #1<<7 \n\t" + "vld1.16 {q12}, [%[dst]] \n\t" + "vld4.8 {d0-d3}, [%[src]] \n\t" // Thumb does not support the standard ARM conditional // instructions but instead requires the 'it' instruction // to signal conditional execution - "it eq \n\t" - "moveq %[tmp], #8 \n\t" - "mov ip, %[dst] \n\t" - - "add %[src], %[src], %[tmp], LSL#2 \n\t" - "add %[dst], %[dst], %[tmp], LSL#1 \n\t" - "subs %[count], %[count], %[tmp] \n\t" - "b 9f \n\t" + "it eq \n\t" + "moveq ip, #8 \n\t" + "mov %[keep_dst], %[dst] \n\t" + + "add %[src], %[src], ip, LSL#2 \n\t" + "add %[dst], %[dst], ip, LSL#1 \n\t" + "subs %[count], %[count], ip \n\t" + "b 9f \n\t" // LOOP "2: \n\t" - "vld1.16 {q12}, [%[dst]]! 
\n\t" - "vld4.8 {d0-d3}, [%[src]]! \n\t" - "vst1.16 {q10}, [ip] \n\t" - "sub ip, %[dst], #8*2 \n\t" - "subs %[count], %[count], #8 \n\t" + "vld1.16 {q12}, [%[dst]]! \n\t" + "vld4.8 {d0-d3}, [%[src]]! \n\t" + "vst1.16 {q10}, [%[keep_dst]] \n\t" + "sub %[keep_dst], %[dst], #8*2 \n\t" + "subs %[count], %[count], #8 \n\t" "9: \n\t" - "pld [%[dst],#32] \n\t" + "pld [%[dst],#32] \n\t" // expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12 \n\t" - "vshr.u16 q11, q12, #5 \n\t" - "vshr.u16 q10, q12, #6+5 \n\t" - "vmovn.u16 d5, q11 \n\t" - "vmovn.u16 d6, q10 \n\t" - "vshl.u8 d4, d4, #3 \n\t" - "vshl.u8 d5, d5, #2 \n\t" - "vshl.u8 d6, d6, #3 \n\t" - - "vmovl.u8 q14, d31 \n\t" - "vmovl.u8 q13, d31 \n\t" - "vmovl.u8 q12, d31 \n\t" + "vmovn.u16 d4, q12 \n\t" + "vshr.u16 q11, q12, #5 \n\t" + "vshr.u16 q10, q12, #6+5 \n\t" + "vmovn.u16 d5, q11 \n\t" + "vmovn.u16 d6, q10 \n\t" + "vshl.u8 d4, d4, #3 \n\t" + "vshl.u8 d5, d5, #2 \n\t" + "vshl.u8 d6, d6, #3 \n\t" + + "vmovl.u8 q14, d31 \n\t" + "vmovl.u8 q13, d31 \n\t" + "vmovl.u8 q12, d31 \n\t" // duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d3 \n\t" - "vmlal.u8 q14, d30, d6 \n\t" - "vmlal.u8 q13, d30, d5 \n\t" - "vmlal.u8 q12, d30, d4 \n\t" - "vshr.u16 q8, q14, #5 \n\t" - "vshr.u16 q9, q13, #6 \n\t" - "vaddhn.u16 d6, q14, q8 \n\t" - "vshr.u16 q8, q12, #5 \n\t" - "vaddhn.u16 d5, q13, q9 \n\t" - "vaddhn.u16 d4, q12, q8 \n\t" + "vmvn.8 d30, d3 \n\t" + "vmlal.u8 q14, d30, d6 \n\t" + "vmlal.u8 q13, d30, d5 \n\t" + "vmlal.u8 q12, d30, d4 \n\t" + "vshr.u16 q8, q14, #5 \n\t" + "vshr.u16 q9, q13, #6 \n\t" + "vaddhn.u16 d6, q14, q8 \n\t" + "vshr.u16 q8, q12, #5 \n\t" + "vaddhn.u16 d5, q13, q9 \n\t" + "vaddhn.u16 d4, q12, q8 \n\t" // intentionally don't calculate alpha // result in d4-d6 #ifdef SK_PMCOLOR_IS_RGBA - "vqadd.u8 d6, d6, d0 \n\t" - "vqadd.u8 d5, d5, d1 \n\t" - "vqadd.u8 d4, d4, d2 \n\t" + "vqadd.u8 d6, d6, d0 \n\t" + "vqadd.u8 d5, d5, d1 \n\t" + "vqadd.u8 d4, d4, d2 \n\t" #else - "vqadd.u8 d6, d6, d2 \n\t" - "vqadd.u8 d5, d5, d1 
\n\t" - "vqadd.u8 d4, d4, d0 \n\t" + "vqadd.u8 d6, d6, d2 \n\t" + "vqadd.u8 d5, d5, d1 \n\t" + "vqadd.u8 d4, d4, d0 \n\t" #endif // pack 8888 {d4-d6} to 0565 q10 - "vshll.u8 q10, d6, #8 \n\t" - "vshll.u8 q3, d5, #8 \n\t" - "vshll.u8 q2, d4, #8 \n\t" - "vsri.u16 q10, q3, #5 \n\t" - "vsri.u16 q10, q2, #11 \n\t" + "vshll.u8 q10, d6, #8 \n\t" + "vshll.u8 q3, d5, #8 \n\t" + "vshll.u8 q2, d4, #8 \n\t" + "vsri.u16 q10, q3, #5 \n\t" + "vsri.u16 q10, q2, #11 \n\t" - "bne 2b \n\t" + "bne 2b \n\t" "1: \n\t" - "vst1.16 {q10}, [ip] \n\t" + "vst1.16 {q10}, [%[keep_dst]] \n\t" : [count] "+r" (count) - : [dst] "r" (dst), [src] "r" (src), [tmp] "r"(tmp) + : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", "d30","d31" |