about summary refs log tree commit diff homepage
path: root/src/jumper/build_stages.py
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-08-02 14:25:55 -0400
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-08-03 01:54:58 +0000
commit255607f094dac0f1c075a5b718005e7bd4d03e3d (patch)
tree6ec199fa3aa57c136f0a3550ece3098635a94097 /src/jumper/build_stages.py
parent20963ff2867fd6f0e2982bdfba26707a33349601 (diff)
8-bit hacking
I think we can replace a lot of legacy code with an SkRasterPipeline backend that works in 8-bit and stays interlaced. Think of this as a "lowerp" replacement for lowp.

I'm having some trouble getting ARMv8 working. ARMv7 should be fine, but I want to turn it on separately from x86. I haven't looked at 32-bit x86 yet, but that's also on the todo list.

Open questions to follow up on:
- is it better to fold every multiply back down to 8-bit (as seen here), or to allow intermediates to accumulate in 16-bit and divide by 255 when done/needed?
- is it better to pass tightly packed 8-bit vectors between stages (as seen here), or to keep the 8-bit values unpacked in 16-bit lanes?
- should we make V wider than 1 register?

GMs look good. All diffs invisible and plausibly due to the 15->8 bit precision drop. A quick bench run showed this running in about 0.75x the time of the existing lowp backend.

Change-Id: I24aa46ff1d19c0b9b8dc192d5b1821cab0b8843c
Reviewed-on: https://skia-review.googlesource.com/29886
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
Diffstat (limited to 'src/jumper/build_stages.py')
-rwxr-xr-xsrc/jumper/build_stages.py64
1 files changed, 56 insertions, 8 deletions
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 699632f2d9..bf724ce0ac 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -14,6 +14,7 @@ objdump = 'gobjdump'
ccache = 'ccache'
stages = 'src/jumper/SkJumper_stages.cpp'
stages_lowp = 'src/jumper/SkJumper_stages_lowp.cpp'
+stages_8bit = 'src/jumper/SkJumper_stages_8bit.cpp'
generated = 'src/jumper/SkJumper_generated.S'
generated_win = 'src/jumper/SkJumper_generated_win.S'
@@ -22,8 +23,9 @@ objdump = sys.argv[2] if len(sys.argv) > 2 else objdump
ccache = sys.argv[3] if len(sys.argv) > 3 else ccache
stages = sys.argv[4] if len(sys.argv) > 4 else stages
stages_lowp = sys.argv[5] if len(sys.argv) > 5 else stages_lowp
-generated = sys.argv[6] if len(sys.argv) > 6 else generated
-generated_win = sys.argv[7] if len(sys.argv) > 7 else generated_win
+stages_8bit = sys.argv[6] if len(sys.argv) > 6 else stages_8bit
+generated = sys.argv[7] if len(sys.argv) > 7 else generated
+generated_win = sys.argv[8] if len(sys.argv) > 8 else generated_win
clang = [ccache, clang, '-x', 'c++']
@@ -48,6 +50,13 @@ subprocess.check_call(clang + cflags + sse2 + win + x86 +
['-c', stages] +
['-o', 'win_x86_sse2.o'])
+subprocess.check_call(clang + cflags + sse2 +
+ ['-c', stages_8bit] +
+ ['-o', '8bit_sse2.o'])
+subprocess.check_call(clang + cflags + sse2 + win +
+ ['-c', stages_8bit] +
+ ['-o', 'win_8bit_sse2.o'])
+
ssse3 = ['-mssse3', '-mno-sse4.1']
subprocess.check_call(clang + cflags + ssse3 +
['-c', stages_lowp] +
@@ -64,6 +73,13 @@ subprocess.check_call(clang + cflags + sse41 + win +
['-c', stages] +
['-o', 'win_sse41.o'])
+subprocess.check_call(clang + cflags + sse41 +
+ ['-c', stages_8bit] +
+ ['-o', '8bit_sse41.o'])
+subprocess.check_call(clang + cflags + sse41 + win +
+ ['-c', stages_8bit] +
+ ['-o', 'win_8bit_sse41.o'])
+
avx = ['-mavx']
subprocess.check_call(clang + cflags + avx +
['-c', stages] +
@@ -86,12 +102,23 @@ subprocess.check_call(clang + cflags + hsw + win +
['-c', stages_lowp] +
['-o', 'win_lowp_hsw.o'])
+subprocess.check_call(clang + cflags + hsw +
+ ['-c', stages_8bit] +
+ ['-o', '8bit_hsw.o'])
+subprocess.check_call(clang + cflags + hsw + win +
+ ['-c', stages_8bit] +
+ ['-o', 'win_8bit_hsw.o'])
+
# iOS disallows the use of register x18,
# so we need to use it as a least-common denominator.
aarch64 = [ '--target=arm64-apple-ios' ]
subprocess.check_call(clang + cflags + aarch64 +
['-c', stages] +
['-o', 'aarch64.o'])
+# TODO: need to work out relocations (adrp, lCPI, etc.)
+#subprocess.check_call(clang + cflags + aarch64 +
+# ['-c', stages_8bit] +
+# ['-o', '8bit_aarch64.o'])
vfp4 = [
'--target=armv7a-linux-gnueabihf',
@@ -100,6 +127,10 @@ vfp4 = [
subprocess.check_call(clang + cflags + vfp4 +
['-c', stages] +
['-o', 'vfp4.o'])
+# TODO: should work fine... I just want to turn this one on separately from x86
+#subprocess.check_call(clang + cflags + vfp4 +
+# ['-c', stages_8bit] +
+# ['-o', '8bit_vfp4.o'])
def parse_object_file(dot_o, directive, target=None):
globl, hidden, label, comment, align = \
@@ -130,6 +161,7 @@ def parse_object_file(dot_o, directive, target=None):
'--insn-width=11',
'-j', '.text',
'-j', '.literal4',
+ '-j', '.literal8',
'-j', '.literal16',
'-j', '.const',
dot_o]
@@ -193,6 +225,7 @@ print '#if defined(__MACH__)'
print ' #define HIDDEN .private_extern'
print ' #define FUNCTION(name)'
print ' #define BALIGN4 .align 2'
+print ' #define BALIGN8 .align 3'
print ' #define BALIGN16 .align 4'
print ' #define BALIGN32 .align 5'
print '#else'
@@ -200,6 +233,7 @@ print ' .section .note.GNU-stack,"",%progbits'
print ' #define HIDDEN .hidden'
print ' #define FUNCTION(name) .type name,%function'
print ' #define BALIGN4 .balign 4'
+print ' #define BALIGN8 .balign 8'
print ' #define BALIGN16 .balign 16'
print ' #define BALIGN32 .balign 32'
print '#endif'
@@ -207,11 +241,13 @@ print '#endif'
print '.text'
print '#if defined(__aarch64__)'
print 'BALIGN4'
-parse_object_file('aarch64.o', '.long')
+parse_object_file( 'aarch64.o', '.long')
+#parse_object_file('8bit_aarch64.o', '.long')
print '#elif defined(__arm__)'
print 'BALIGN4'
-parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
+parse_object_file( 'vfp4.o', '.long', target='elf32-littlearm')
+#parse_object_file('8bit_vfp4.o', '.long', target='elf32-littlearm')
print '#elif defined(__x86_64__)'
print 'BALIGN32'
@@ -223,9 +259,15 @@ parse_object_file('sse41.o', '.byte')
print 'BALIGN32'
parse_object_file('sse2.o', '.byte')
print 'BALIGN32'
-parse_object_file('lowp_hsw.o', '.byte')
+parse_object_file('lowp_hsw.o', '.byte')
+print 'BALIGN32'
+parse_object_file('lowp_ssse3.o', '.byte')
+print 'BALIGN32'
+parse_object_file('8bit_hsw.o', '.byte')
+print 'BALIGN32'
+parse_object_file('8bit_sse41.o', '.byte')
print 'BALIGN32'
-parse_object_file('lowp_ssse3.o', '.byte')
+parse_object_file('8bit_sse2.o', '.byte')
print '#elif defined(__i386__)'
print 'BALIGN32'
@@ -253,9 +295,15 @@ parse_object_file('win_sse41.o', 'DB')
print 'ALIGN 32'
parse_object_file('win_sse2.o', 'DB')
print 'ALIGN 32'
-parse_object_file('win_lowp_hsw.o', 'DB')
+parse_object_file('win_lowp_hsw.o', 'DB')
+print 'ALIGN 32'
+parse_object_file('win_lowp_ssse3.o', 'DB')
+print 'ALIGN 32'
+parse_object_file('win_8bit_hsw.o', 'DB')
+print 'ALIGN 32'
+parse_object_file('win_8bit_sse41.o', 'DB')
print 'ALIGN 32'
-parse_object_file('win_lowp_ssse3.o', 'DB')
+parse_object_file('win_8bit_sse2.o', 'DB')
print 'ELSE'
print '.MODEL FLAT,C'