about summary refs log tree commit diff homepage
path: root/src/jumper/build_stages.py
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-08-02 14:25:55 -0400
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-08-03 01:54:58 +0000
commit255607f094dac0f1c075a5b718005e7bd4d03e3d (patch)
tree6ec199fa3aa57c136f0a3550ece3098635a94097 /src/jumper/build_stages.py
parent20963ff2867fd6f0e2982bdfba26707a33349601 (diff)
8-bit hacking
I think we can replace a lot of legacy code with an SkRasterPipeline backend that works in 8-bit and stays interlaced. Think of this as a "lowerp" replacement for lowp.

I'm having some trouble getting ARMv8 working. ARMv7 should be fine, but I want to turn it on separately from x86. I haven't looked at 32-bit x86 yet, but that's also on the todo list.

Open questions to follow up on:
- is it better to fold every multiply back down to 8-bit (as seen here), or to allow intermediates to accumulate in 16-bit and divide by 255 when done/needed?
- is it better to pass tightly packed 8-bit vectors between stages (as seen here), or to keep the 8-bit values unpacked in 16-bit lanes?
- should we make V wider than 1 register?

GMs look good. All diffs invisible and plausibly due to the 15->8 bit precision drop. A quick bench run showed this running in about 0.75x the time of the existing lowp backend.

Change-Id: I24aa46ff1d19c0b9b8dc192d5b1821cab0b8843c
Reviewed-on: https://skia-review.googlesource.com/29886
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
Diffstat (limited to 'src/jumper/build_stages.py')
-rwxr-xr-xsrc/jumper/build_stages.py64
1 files changed, 56 insertions, 8 deletions
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 699632f2d9..bf724ce0ac 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -14,6 +14,7 @@ objdump = 'gobjdump'
ccache = 'ccache'
stages = 'src/jumper/SkJumper_stages.cpp'
stages_lowp = 'src/jumper/SkJumper_stages_lowp.cpp'
+stages_8bit = 'src/jumper/SkJumper_stages_8bit.cpp'
generated = 'src/jumper/SkJumper_generated.S'
generated_win = 'src/jumper/SkJumper_generated_win.S'
@@ -22,8 +23,9 @@ objdump = sys.argv[2] if len(sys.argv) > 2 else objdump
ccache = sys.argv[3] if len(sys.argv) > 3 else ccache
stages = sys.argv[4] if len(sys.argv) > 4 else stages
stages_lowp = sys.argv[5] if len(sys.argv) > 5 else stages_lowp
-generated = sys.argv[6] if len(sys.argv) > 6 else generated
-generated_win = sys.argv[7] if len(sys.argv) > 7 else generated_win
+stages_8bit = sys.argv[6] if len(sys.argv) > 6 else stages_8bit
+generated = sys.argv[7] if len(sys.argv) > 7 else generated
+generated_win = sys.argv[8] if len(sys.argv) > 8 else generated_win
clang = [ccache, clang, '-x', 'c++']
@@ -48,6 +50,13 @@ subprocess.check_call(clang + cflags + sse2 + win + x86 +
['-c', stages] +
['-o', 'win_x86_sse2.o'])
+subprocess.check_call(clang + cflags + sse2 +
+ ['-c', stages_8bit] +
+ ['-o', '8bit_sse2.o'])
+subprocess.check_call(clang + cflags + sse2 + win +
+ ['-c', stages_8bit] +
+ ['-o', 'win_8bit_sse2.o'])
+
ssse3 = ['-mssse3', '-mno-sse4.1']
subprocess.check_call(clang + cflags + ssse3 +
['-c', stages_lowp] +
@@ -64,6 +73,13 @@ subprocess.check_call(clang + cflags + sse41 + win +
['-c', stages] +
['-o', 'win_sse41.o'])
+subprocess.check_call(clang + cflags + sse41 +
+ ['-c', stages_8bit] +
+ ['-o', '8bit_sse41.o'])
+subprocess.check_call(clang + cflags + sse41 + win +
+ ['-c', stages_8bit] +
+ ['-o', 'win_8bit_sse41.o'])
+
avx = ['-mavx']
subprocess.check_call(clang + cflags + avx +
['-c', stages] +
@@ -86,12 +102,23 @@ subprocess.check_call(clang + cflags + hsw + win +
['-c', stages_lowp] +
['-o', 'win_lowp_hsw.o'])
+subprocess.check_call(clang + cflags + hsw +
+ ['-c', stages_8bit] +
+ ['-o', '8bit_hsw.o'])
+subprocess.check_call(clang + cflags + hsw + win +
+ ['-c', stages_8bit] +
+ ['-o', 'win_8bit_hsw.o'])
+
# iOS disallows the use of register x18,
# so we need to use it as a least-common denominator.
aarch64 = [ '--target=arm64-apple-ios' ]
subprocess.check_call(clang + cflags + aarch64 +
['-c', stages] +
['-o', 'aarch64.o'])
+# TODO: need to work out relocations (adrp, lCPI, etc.)
+#subprocess.check_call(clang + cflags + aarch64 +
+# ['-c', stages_8bit] +
+# ['-o', '8bit_aarch64.o'])
vfp4 = [
'--target=armv7a-linux-gnueabihf',
@@ -100,6 +127,10 @@ vfp4 = [
subprocess.check_call(clang + cflags + vfp4 +
['-c', stages] +
['-o', 'vfp4.o'])
+# TODO: should work fine... I just want to turn this one on separately from x86
+#subprocess.check_call(clang + cflags + vfp4 +
+# ['-c', stages_8bit] +
+# ['-o', '8bit_vfp4.o'])
def parse_object_file(dot_o, directive, target=None):
globl, hidden, label, comment, align = \
@@ -130,6 +161,7 @@ def parse_object_file(dot_o, directive, target=None):
'--insn-width=11',
'-j', '.text',
'-j', '.literal4',
+ '-j', '.literal8',
'-j', '.literal16',
'-j', '.const',
dot_o]
@@ -193,6 +225,7 @@ print '#if defined(__MACH__)'
print ' #define HIDDEN .private_extern'
print ' #define FUNCTION(name)'
print ' #define BALIGN4 .align 2'
+print ' #define BALIGN8 .align 3'
print ' #define BALIGN16 .align 4'
print ' #define BALIGN32 .align 5'
print '#else'
@@ -200,6 +233,7 @@ print ' .section .note.GNU-stack,"",%progbits'
print ' #define HIDDEN .hidden'
print ' #define FUNCTION(name) .type name,%function'
print ' #define BALIGN4 .balign 4'
+print ' #define BALIGN8 .balign 8'
print ' #define BALIGN16 .balign 16'
print ' #define BALIGN32 .balign 32'
print '#endif'
@@ -207,11 +241,13 @@ print '#endif'
print '.text'
print '#if defined(__aarch64__)'
print 'BALIGN4'
-parse_object_file('aarch64.o', '.long')
+parse_object_file( 'aarch64.o', '.long')
+#parse_object_file('8bit_aarch64.o', '.long')
print '#elif defined(__arm__)'
print 'BALIGN4'
-parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
+parse_object_file( 'vfp4.o', '.long', target='elf32-littlearm')
+#parse_object_file('8bit_vfp4.o', '.long', target='elf32-littlearm')
print '#elif defined(__x86_64__)'
print 'BALIGN32'
@@ -223,9 +259,15 @@ parse_object_file('sse41.o', '.byte')
print 'BALIGN32'
parse_object_file('sse2.o', '.byte')
print 'BALIGN32'
-parse_object_file('lowp_hsw.o', '.byte')
+parse_object_file('lowp_hsw.o', '.byte')
+print 'BALIGN32'
+parse_object_file('lowp_ssse3.o', '.byte')
+print 'BALIGN32'
+parse_object_file('8bit_hsw.o', '.byte')
+print 'BALIGN32'
+parse_object_file('8bit_sse41.o', '.byte')
print 'BALIGN32'
-parse_object_file('lowp_ssse3.o', '.byte')
+parse_object_file('8bit_sse2.o', '.byte')
print '#elif defined(__i386__)'
print 'BALIGN32'
@@ -253,9 +295,15 @@ parse_object_file('win_sse41.o', 'DB')
print 'ALIGN 32'
parse_object_file('win_sse2.o', 'DB')
print 'ALIGN 32'
-parse_object_file('win_lowp_hsw.o', 'DB')
+parse_object_file('win_lowp_hsw.o', 'DB')
+print 'ALIGN 32'
+parse_object_file('win_lowp_ssse3.o', 'DB')
+print 'ALIGN 32'
+parse_object_file('win_8bit_hsw.o', 'DB')
+print 'ALIGN 32'
+parse_object_file('win_8bit_sse41.o', 'DB')
print 'ALIGN 32'
-parse_object_file('win_lowp_ssse3.o', 'DB')
+parse_object_file('win_8bit_sse2.o', 'DB')
print 'ELSE'
print '.MODEL FLAT,C'