about | summary | refs | log | tree | commit | diff | homepage
path: root/src
diff options
context:
space:
mode:
author: Mike Klein <mtklein@chromium.org> 2017-03-02 09:19:25 -0500
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-03-02 15:12:44 +0000
commit: 9c220e00896e8d344e9cfde5934b00fb011d14b8 (patch)
tree: fb1c82126e05324a19d697ed46875aed661bb685 /src
parent: 1429550f5d9b6df0693b7cc10de4258982b6251d (diff)
SkJumper: allow the compiler to generate FMAs
Today we use mad() to get FMAs where possible. -ffp-contract=fast lets the compiler generate them if it spots an opportunity. It looks like it's found a mix of FMAs and FMSs. I will follow up by seeing if we can relax the use of mad(). Quick experiments say no, but less quick experiments may say otherwise.

Change-Id: I5228811cfbf11cccc0d715672a464fd1e1cea3b0
Reviewed-on: https://skia-review.googlesource.com/9136
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src')
-rw-r--r-- src/jumper/SkJumper_generated.S 52
-rw-r--r-- src/jumper/SkJumper_generated_win.S 28
-rwxr-xr-x src/jumper/build_stages.py 3
3 files changed, 36 insertions, 47 deletions
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 06a5e5edf8..750f5a046d 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -726,8 +726,7 @@ _sk_repeat_x_aarch64:
.long 0x4eb18651 // add v17.4s, v18.4s, v17.4s
.long 0x6e32fc12 // fdiv v18.4s, v0.4s, v18.4s
.long 0x4e219a52 // frintm v18.4s, v18.4s
- .long 0x4f909250 // fmul v16.4s, v18.4s, v16.s[0]
- .long 0x4eb0d400 // fsub v0.4s, v0.4s, v16.4s
+ .long 0x4f905240 // fmls v0.4s, v18.4s, v16.s[0]
.long 0x4eb1f400 // fmin v0.4s, v0.4s, v17.4s
.long 0xd61f0060 // br x3
@@ -740,8 +739,7 @@ _sk_repeat_y_aarch64:
.long 0x4eb18651 // add v17.4s, v18.4s, v17.4s
.long 0x6e32fc32 // fdiv v18.4s, v1.4s, v18.4s
.long 0x4e219a52 // frintm v18.4s, v18.4s
- .long 0x4f909250 // fmul v16.4s, v18.4s, v16.s[0]
- .long 0x4eb0d421 // fsub v1.4s, v1.4s, v16.4s
+ .long 0x4f905241 // fmls v1.4s, v18.4s, v16.s[0]
.long 0x4eb1f421 // fmin v1.4s, v1.4s, v17.4s
.long 0xd61f0060 // br x3
@@ -755,13 +753,12 @@ _sk_mirror_x_aarch64:
.long 0x4e040612 // dup v18.4s, v16.s[0]
.long 0x6e32fc12 // fdiv v18.4s, v0.4s, v18.4s
.long 0x4e219a52 // frintm v18.4s, v18.4s
- .long 0x4f909250 // fmul v16.4s, v18.4s, v16.s[0]
- .long 0x4eb0d400 // fsub v0.4s, v0.4s, v16.4s
- .long 0x6f07e7f2 // movi v18.2d, #0xffffffffffffffff
+ .long 0x4f905240 // fmls v0.4s, v18.4s, v16.s[0]
+ .long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff
.long 0x4eb1d400 // fsub v0.4s, v0.4s, v17.4s
- .long 0x4eb28632 // add v18.4s, v17.4s, v18.4s
+ .long 0x4eb08630 // add v16.4s, v17.4s, v16.4s
.long 0x4ea0f800 // fabs v0.4s, v0.4s
- .long 0x4eb2f400 // fmin v0.4s, v0.4s, v18.4s
+ .long 0x4eb0f400 // fmin v0.4s, v0.4s, v16.4s
.long 0xd61f0060 // br x3
.globl _sk_mirror_y_aarch64
@@ -774,13 +771,12 @@ _sk_mirror_y_aarch64:
.long 0x4e040612 // dup v18.4s, v16.s[0]
.long 0x6e32fc32 // fdiv v18.4s, v1.4s, v18.4s
.long 0x4e219a52 // frintm v18.4s, v18.4s
- .long 0x4f909250 // fmul v16.4s, v18.4s, v16.s[0]
- .long 0x4eb0d421 // fsub v1.4s, v1.4s, v16.4s
- .long 0x6f07e7f2 // movi v18.2d, #0xffffffffffffffff
+ .long 0x4f905241 // fmls v1.4s, v18.4s, v16.s[0]
+ .long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff
.long 0x4eb1d421 // fsub v1.4s, v1.4s, v17.4s
- .long 0x4eb28632 // add v18.4s, v17.4s, v18.4s
+ .long 0x4eb08630 // add v16.4s, v17.4s, v16.4s
.long 0x4ea0f821 // fabs v1.4s, v1.4s
- .long 0x4eb2f421 // fmin v1.4s, v1.4s, v18.4s
+ .long 0x4eb0f421 // fmin v1.4s, v1.4s, v16.4s
.long 0xd61f0060 // br x3
.globl _sk_matrix_2x3_aarch64
@@ -2479,11 +2475,10 @@ _sk_repeat_x_hsw:
.byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8
.byte 196,65,124,94,200 // vdivps %ymm8,%ymm0,%ymm9
.byte 196,67,125,8,201,1 // vroundps $0x1,%ymm9,%ymm9
- .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9
- .byte 196,193,124,92,193 // vsubps %ymm9,%ymm0,%ymm0
- .byte 196,65,53,118,201 // vpcmpeqd %ymm9,%ymm9,%ymm9
- .byte 196,65,61,254,193 // vpaddd %ymm9,%ymm8,%ymm8
- .byte 196,193,124,93,192 // vminps %ymm8,%ymm0,%ymm0
+ .byte 196,98,61,172,200 // vfnmadd213ps %ymm0,%ymm8,%ymm9
+ .byte 197,253,118,192 // vpcmpeqd %ymm0,%ymm0,%ymm0
+ .byte 197,189,254,192 // vpaddd %ymm0,%ymm8,%ymm0
+ .byte 197,180,93,192 // vminps %ymm0,%ymm9,%ymm0
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -2493,11 +2488,10 @@ _sk_repeat_y_hsw:
.byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8
.byte 196,65,116,94,200 // vdivps %ymm8,%ymm1,%ymm9
.byte 196,67,125,8,201,1 // vroundps $0x1,%ymm9,%ymm9
- .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9
- .byte 196,193,116,92,201 // vsubps %ymm9,%ymm1,%ymm1
- .byte 196,65,53,118,201 // vpcmpeqd %ymm9,%ymm9,%ymm9
- .byte 196,65,61,254,193 // vpaddd %ymm9,%ymm8,%ymm8
- .byte 196,193,116,93,200 // vminps %ymm8,%ymm1,%ymm1
+ .byte 196,98,61,172,201 // vfnmadd213ps %ymm1,%ymm8,%ymm9
+ .byte 197,245,118,201 // vpcmpeqd %ymm1,%ymm1,%ymm1
+ .byte 197,189,254,201 // vpaddd %ymm1,%ymm8,%ymm1
+ .byte 197,180,93,201 // vminps %ymm1,%ymm9,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -2511,9 +2505,8 @@ _sk_mirror_x_hsw:
.byte 196,226,125,24,192 // vbroadcastss %xmm0,%ymm0
.byte 197,44,94,192 // vdivps %ymm0,%ymm10,%ymm8
.byte 196,67,125,8,192,1 // vroundps $0x1,%ymm8,%ymm8
- .byte 197,188,89,192 // vmulps %ymm0,%ymm8,%ymm0
- .byte 197,172,92,192 // vsubps %ymm0,%ymm10,%ymm0
- .byte 196,193,124,92,193 // vsubps %ymm9,%ymm0,%ymm0
+ .byte 196,66,125,172,194 // vfnmadd213ps %ymm10,%ymm0,%ymm8
+ .byte 196,193,60,92,193 // vsubps %ymm9,%ymm8,%ymm0
.byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
.byte 197,60,92,192 // vsubps %ymm0,%ymm8,%ymm8
.byte 197,188,84,192 // vandps %ymm0,%ymm8,%ymm0
@@ -2533,9 +2526,8 @@ _sk_mirror_y_hsw:
.byte 196,226,125,24,201 // vbroadcastss %xmm1,%ymm1
.byte 197,44,94,193 // vdivps %ymm1,%ymm10,%ymm8
.byte 196,67,125,8,192,1 // vroundps $0x1,%ymm8,%ymm8
- .byte 197,188,89,201 // vmulps %ymm1,%ymm8,%ymm1
- .byte 197,172,92,201 // vsubps %ymm1,%ymm10,%ymm1
- .byte 196,193,116,92,201 // vsubps %ymm9,%ymm1,%ymm1
+ .byte 196,66,117,172,194 // vfnmadd213ps %ymm10,%ymm1,%ymm8
+ .byte 196,193,60,92,201 // vsubps %ymm9,%ymm8,%ymm1
.byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
.byte 197,60,92,193 // vsubps %ymm1,%ymm8,%ymm8
.byte 197,188,84,201 // vandps %ymm1,%ymm8,%ymm1
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 1cf5e7eb63..4ec2b01b48 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -657,11 +657,10 @@ _sk_repeat_x_hsw LABEL PROC
DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
DB 196,65,124,94,200 ; vdivps %ymm8,%ymm0,%ymm9
DB 196,67,125,8,201,1 ; vroundps $0x1,%ymm9,%ymm9
- DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9
- DB 196,193,124,92,193 ; vsubps %ymm9,%ymm0,%ymm0
- DB 196,65,53,118,201 ; vpcmpeqd %ymm9,%ymm9,%ymm9
- DB 196,65,61,254,193 ; vpaddd %ymm9,%ymm8,%ymm8
- DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0
+ DB 196,98,61,172,200 ; vfnmadd213ps %ymm0,%ymm8,%ymm9
+ DB 197,253,118,192 ; vpcmpeqd %ymm0,%ymm0,%ymm0
+ DB 197,189,254,192 ; vpaddd %ymm0,%ymm8,%ymm0
+ DB 197,180,93,192 ; vminps %ymm0,%ymm9,%ymm0
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -671,11 +670,10 @@ _sk_repeat_y_hsw LABEL PROC
DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
DB 196,65,116,94,200 ; vdivps %ymm8,%ymm1,%ymm9
DB 196,67,125,8,201,1 ; vroundps $0x1,%ymm9,%ymm9
- DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9
- DB 196,193,116,92,201 ; vsubps %ymm9,%ymm1,%ymm1
- DB 196,65,53,118,201 ; vpcmpeqd %ymm9,%ymm9,%ymm9
- DB 196,65,61,254,193 ; vpaddd %ymm9,%ymm8,%ymm8
- DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1
+ DB 196,98,61,172,201 ; vfnmadd213ps %ymm1,%ymm8,%ymm9
+ DB 197,245,118,201 ; vpcmpeqd %ymm1,%ymm1,%ymm1
+ DB 197,189,254,201 ; vpaddd %ymm1,%ymm8,%ymm1
+ DB 197,180,93,201 ; vminps %ymm1,%ymm9,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -689,9 +687,8 @@ _sk_mirror_x_hsw LABEL PROC
DB 196,226,125,24,192 ; vbroadcastss %xmm0,%ymm0
DB 197,44,94,192 ; vdivps %ymm0,%ymm10,%ymm8
DB 196,67,125,8,192,1 ; vroundps $0x1,%ymm8,%ymm8
- DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0
- DB 197,172,92,192 ; vsubps %ymm0,%ymm10,%ymm0
- DB 196,193,124,92,193 ; vsubps %ymm9,%ymm0,%ymm0
+ DB 196,66,125,172,194 ; vfnmadd213ps %ymm10,%ymm0,%ymm8
+ DB 196,193,60,92,193 ; vsubps %ymm9,%ymm8,%ymm0
DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
DB 197,60,92,192 ; vsubps %ymm0,%ymm8,%ymm8
DB 197,188,84,192 ; vandps %ymm0,%ymm8,%ymm0
@@ -711,9 +708,8 @@ _sk_mirror_y_hsw LABEL PROC
DB 196,226,125,24,201 ; vbroadcastss %xmm1,%ymm1
DB 197,44,94,193 ; vdivps %ymm1,%ymm10,%ymm8
DB 196,67,125,8,192,1 ; vroundps $0x1,%ymm8,%ymm8
- DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1
- DB 197,172,92,201 ; vsubps %ymm1,%ymm10,%ymm1
- DB 196,193,116,92,201 ; vsubps %ymm9,%ymm1,%ymm1
+ DB 196,66,117,172,194 ; vfnmadd213ps %ymm10,%ymm1,%ymm8
+ DB 196,193,60,92,201 ; vsubps %ymm9,%ymm8,%ymm1
DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
DB 197,60,92,193 ; vsubps %ymm1,%ymm8,%ymm8
DB 197,188,84,201 ; vandps %ymm1,%ymm8,%ymm1
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 4ca04d15b3..27d3a41d83 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -18,7 +18,8 @@ objdump = 'gobjdump'
#ndk = '/home/mtklein/ndk/'
#objdump = '/home/mtklein/binutils-2.27/binutils/objdump'
-cflags = '-std=c++11 -Os -fomit-frame-pointer -DJUMPER'.split()
+cflags = ['-std=c++11', '-Os', '-DJUMPER',
+ '-fomit-frame-pointer', '-ffp-contract=fast' ]
sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
subprocess.check_call(clang + cflags + sse2 +