diff options
author | Mike Klein <mtklein@chromium.org> | 2017-03-02 09:19:25 -0500 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-03-02 15:12:44 +0000 |
commit | 9c220e00896e8d344e9cfde5934b00fb011d14b8 (patch) | |
tree | fb1c82126e05324a19d697ed46875aed661bb685 | |
parent | 1429550f5d9b6df0693b7cc10de4258982b6251d (diff) |
SkJumper: allow the compiler to generate FMAs
Today we use mad() to get FMAs where possible.
Adding -ffp-contract=fast lets the compiler generate FMAs on its own wherever it spots an opportunity.
Judging by the regenerated assembly, the compiler has found a mix of FMAs and FMSs.
I will follow up by seeing if we can relax the use of mad().
Quick experiments say no, but more thorough experiments may say otherwise.
Change-Id: I5228811cfbf11cccc0d715672a464fd1e1cea3b0
Reviewed-on: https://skia-review.googlesource.com/9136
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
-rw-r--r-- | src/jumper/SkJumper_generated.S | 52 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 28 | ||||
-rwxr-xr-x | src/jumper/build_stages.py | 3 |
3 files changed, 36 insertions, 47 deletions
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 06a5e5edf8..750f5a046d 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -726,8 +726,7 @@ _sk_repeat_x_aarch64: .long 0x4eb18651 // add v17.4s, v18.4s, v17.4s .long 0x6e32fc12 // fdiv v18.4s, v0.4s, v18.4s .long 0x4e219a52 // frintm v18.4s, v18.4s - .long 0x4f909250 // fmul v16.4s, v18.4s, v16.s[0] - .long 0x4eb0d400 // fsub v0.4s, v0.4s, v16.4s + .long 0x4f905240 // fmls v0.4s, v18.4s, v16.s[0] .long 0x4eb1f400 // fmin v0.4s, v0.4s, v17.4s .long 0xd61f0060 // br x3 @@ -740,8 +739,7 @@ _sk_repeat_y_aarch64: .long 0x4eb18651 // add v17.4s, v18.4s, v17.4s .long 0x6e32fc32 // fdiv v18.4s, v1.4s, v18.4s .long 0x4e219a52 // frintm v18.4s, v18.4s - .long 0x4f909250 // fmul v16.4s, v18.4s, v16.s[0] - .long 0x4eb0d421 // fsub v1.4s, v1.4s, v16.4s + .long 0x4f905241 // fmls v1.4s, v18.4s, v16.s[0] .long 0x4eb1f421 // fmin v1.4s, v1.4s, v17.4s .long 0xd61f0060 // br x3 @@ -755,13 +753,12 @@ _sk_mirror_x_aarch64: .long 0x4e040612 // dup v18.4s, v16.s[0] .long 0x6e32fc12 // fdiv v18.4s, v0.4s, v18.4s .long 0x4e219a52 // frintm v18.4s, v18.4s - .long 0x4f909250 // fmul v16.4s, v18.4s, v16.s[0] - .long 0x4eb0d400 // fsub v0.4s, v0.4s, v16.4s - .long 0x6f07e7f2 // movi v18.2d, #0xffffffffffffffff + .long 0x4f905240 // fmls v0.4s, v18.4s, v16.s[0] + .long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff .long 0x4eb1d400 // fsub v0.4s, v0.4s, v17.4s - .long 0x4eb28632 // add v18.4s, v17.4s, v18.4s + .long 0x4eb08630 // add v16.4s, v17.4s, v16.4s .long 0x4ea0f800 // fabs v0.4s, v0.4s - .long 0x4eb2f400 // fmin v0.4s, v0.4s, v18.4s + .long 0x4eb0f400 // fmin v0.4s, v0.4s, v16.4s .long 0xd61f0060 // br x3 .globl _sk_mirror_y_aarch64 @@ -774,13 +771,12 @@ _sk_mirror_y_aarch64: .long 0x4e040612 // dup v18.4s, v16.s[0] .long 0x6e32fc32 // fdiv v18.4s, v1.4s, v18.4s .long 0x4e219a52 // frintm v18.4s, v18.4s - .long 0x4f909250 // fmul v16.4s, v18.4s, v16.s[0] - .long 
0x4eb0d421 // fsub v1.4s, v1.4s, v16.4s - .long 0x6f07e7f2 // movi v18.2d, #0xffffffffffffffff + .long 0x4f905241 // fmls v1.4s, v18.4s, v16.s[0] + .long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff .long 0x4eb1d421 // fsub v1.4s, v1.4s, v17.4s - .long 0x4eb28632 // add v18.4s, v17.4s, v18.4s + .long 0x4eb08630 // add v16.4s, v17.4s, v16.4s .long 0x4ea0f821 // fabs v1.4s, v1.4s - .long 0x4eb2f421 // fmin v1.4s, v1.4s, v18.4s + .long 0x4eb0f421 // fmin v1.4s, v1.4s, v16.4s .long 0xd61f0060 // br x3 .globl _sk_matrix_2x3_aarch64 @@ -2479,11 +2475,10 @@ _sk_repeat_x_hsw: .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8 .byte 196,65,124,94,200 // vdivps %ymm8,%ymm0,%ymm9 .byte 196,67,125,8,201,1 // vroundps $0x1,%ymm9,%ymm9 - .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9 - .byte 196,193,124,92,193 // vsubps %ymm9,%ymm0,%ymm0 - .byte 196,65,53,118,201 // vpcmpeqd %ymm9,%ymm9,%ymm9 - .byte 196,65,61,254,193 // vpaddd %ymm9,%ymm8,%ymm8 - .byte 196,193,124,93,192 // vminps %ymm8,%ymm0,%ymm0 + .byte 196,98,61,172,200 // vfnmadd213ps %ymm0,%ymm8,%ymm9 + .byte 197,253,118,192 // vpcmpeqd %ymm0,%ymm0,%ymm0 + .byte 197,189,254,192 // vpaddd %ymm0,%ymm8,%ymm0 + .byte 197,180,93,192 // vminps %ymm0,%ymm9,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -2493,11 +2488,10 @@ _sk_repeat_y_hsw: .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8 .byte 196,65,116,94,200 // vdivps %ymm8,%ymm1,%ymm9 .byte 196,67,125,8,201,1 // vroundps $0x1,%ymm9,%ymm9 - .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9 - .byte 196,193,116,92,201 // vsubps %ymm9,%ymm1,%ymm1 - .byte 196,65,53,118,201 // vpcmpeqd %ymm9,%ymm9,%ymm9 - .byte 196,65,61,254,193 // vpaddd %ymm9,%ymm8,%ymm8 - .byte 196,193,116,93,200 // vminps %ymm8,%ymm1,%ymm1 + .byte 196,98,61,172,201 // vfnmadd213ps %ymm1,%ymm8,%ymm9 + .byte 197,245,118,201 // vpcmpeqd %ymm1,%ymm1,%ymm1 + .byte 197,189,254,201 // vpaddd %ymm1,%ymm8,%ymm1 + .byte 197,180,93,201 // vminps %ymm1,%ymm9,%ymm1 .byte 72,173 // lods 
%ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -2511,9 +2505,8 @@ _sk_mirror_x_hsw: .byte 196,226,125,24,192 // vbroadcastss %xmm0,%ymm0 .byte 197,44,94,192 // vdivps %ymm0,%ymm10,%ymm8 .byte 196,67,125,8,192,1 // vroundps $0x1,%ymm8,%ymm8 - .byte 197,188,89,192 // vmulps %ymm0,%ymm8,%ymm0 - .byte 197,172,92,192 // vsubps %ymm0,%ymm10,%ymm0 - .byte 196,193,124,92,193 // vsubps %ymm9,%ymm0,%ymm0 + .byte 196,66,125,172,194 // vfnmadd213ps %ymm10,%ymm0,%ymm8 + .byte 196,193,60,92,193 // vsubps %ymm9,%ymm8,%ymm0 .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8 .byte 197,60,92,192 // vsubps %ymm0,%ymm8,%ymm8 .byte 197,188,84,192 // vandps %ymm0,%ymm8,%ymm0 @@ -2533,9 +2526,8 @@ _sk_mirror_y_hsw: .byte 196,226,125,24,201 // vbroadcastss %xmm1,%ymm1 .byte 197,44,94,193 // vdivps %ymm1,%ymm10,%ymm8 .byte 196,67,125,8,192,1 // vroundps $0x1,%ymm8,%ymm8 - .byte 197,188,89,201 // vmulps %ymm1,%ymm8,%ymm1 - .byte 197,172,92,201 // vsubps %ymm1,%ymm10,%ymm1 - .byte 196,193,116,92,201 // vsubps %ymm9,%ymm1,%ymm1 + .byte 196,66,117,172,194 // vfnmadd213ps %ymm10,%ymm1,%ymm8 + .byte 196,193,60,92,201 // vsubps %ymm9,%ymm8,%ymm1 .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8 .byte 197,60,92,193 // vsubps %ymm1,%ymm8,%ymm8 .byte 197,188,84,201 // vandps %ymm1,%ymm8,%ymm1 diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 1cf5e7eb63..4ec2b01b48 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -657,11 +657,10 @@ _sk_repeat_x_hsw LABEL PROC DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 DB 196,65,124,94,200 ; vdivps %ymm8,%ymm0,%ymm9 DB 196,67,125,8,201,1 ; vroundps $0x1,%ymm9,%ymm9 - DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9 - DB 196,193,124,92,193 ; vsubps %ymm9,%ymm0,%ymm0 - DB 196,65,53,118,201 ; vpcmpeqd %ymm9,%ymm9,%ymm9 - DB 196,65,61,254,193 ; vpaddd %ymm9,%ymm8,%ymm8 - DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0 + DB 196,98,61,172,200 ; vfnmadd213ps %ymm0,%ymm8,%ymm9 + DB 
197,253,118,192 ; vpcmpeqd %ymm0,%ymm0,%ymm0 + DB 197,189,254,192 ; vpaddd %ymm0,%ymm8,%ymm0 + DB 197,180,93,192 ; vminps %ymm0,%ymm9,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -671,11 +670,10 @@ _sk_repeat_y_hsw LABEL PROC DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 DB 196,65,116,94,200 ; vdivps %ymm8,%ymm1,%ymm9 DB 196,67,125,8,201,1 ; vroundps $0x1,%ymm9,%ymm9 - DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9 - DB 196,193,116,92,201 ; vsubps %ymm9,%ymm1,%ymm1 - DB 196,65,53,118,201 ; vpcmpeqd %ymm9,%ymm9,%ymm9 - DB 196,65,61,254,193 ; vpaddd %ymm9,%ymm8,%ymm8 - DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1 + DB 196,98,61,172,201 ; vfnmadd213ps %ymm1,%ymm8,%ymm9 + DB 197,245,118,201 ; vpcmpeqd %ymm1,%ymm1,%ymm1 + DB 197,189,254,201 ; vpaddd %ymm1,%ymm8,%ymm1 + DB 197,180,93,201 ; vminps %ymm1,%ymm9,%ymm1 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -689,9 +687,8 @@ _sk_mirror_x_hsw LABEL PROC DB 196,226,125,24,192 ; vbroadcastss %xmm0,%ymm0 DB 197,44,94,192 ; vdivps %ymm0,%ymm10,%ymm8 DB 196,67,125,8,192,1 ; vroundps $0x1,%ymm8,%ymm8 - DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0 - DB 197,172,92,192 ; vsubps %ymm0,%ymm10,%ymm0 - DB 196,193,124,92,193 ; vsubps %ymm9,%ymm0,%ymm0 + DB 196,66,125,172,194 ; vfnmadd213ps %ymm10,%ymm0,%ymm8 + DB 196,193,60,92,193 ; vsubps %ymm9,%ymm8,%ymm0 DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8 DB 197,60,92,192 ; vsubps %ymm0,%ymm8,%ymm8 DB 197,188,84,192 ; vandps %ymm0,%ymm8,%ymm0 @@ -711,9 +708,8 @@ _sk_mirror_y_hsw LABEL PROC DB 196,226,125,24,201 ; vbroadcastss %xmm1,%ymm1 DB 197,44,94,193 ; vdivps %ymm1,%ymm10,%ymm8 DB 196,67,125,8,192,1 ; vroundps $0x1,%ymm8,%ymm8 - DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1 - DB 197,172,92,201 ; vsubps %ymm1,%ymm10,%ymm1 - DB 196,193,116,92,201 ; vsubps %ymm9,%ymm1,%ymm1 + DB 196,66,117,172,194 ; vfnmadd213ps %ymm10,%ymm1,%ymm8 + DB 196,193,60,92,201 ; vsubps %ymm9,%ymm8,%ymm1 DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8 DB 197,60,92,193 ; 
vsubps %ymm1,%ymm8,%ymm8 DB 197,188,84,201 ; vandps %ymm1,%ymm8,%ymm1 diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py index 4ca04d15b3..27d3a41d83 100755 --- a/src/jumper/build_stages.py +++ b/src/jumper/build_stages.py @@ -18,7 +18,8 @@ objdump = 'gobjdump' #ndk = '/home/mtklein/ndk/' #objdump = '/home/mtklein/binutils-2.27/binutils/objdump' -cflags = '-std=c++11 -Os -fomit-frame-pointer -DJUMPER'.split() +cflags = ['-std=c++11', '-Os', '-DJUMPER', + '-fomit-frame-pointer', '-ffp-contract=fast' ] sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split() subprocess.check_call(clang + cflags + sse2 + |