diff options
author | Mike Klein <mtklein@chromium.org> | 2017-08-28 17:53:34 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-08-29 18:27:51 +0000 |
commit | b561b764d894260b77d3c44f8fa182802897f2e1 (patch) | |
tree | 3d3e22e7f0c76bbb7775dba0e566aee28d3322a2 /src/jumper | |
parent | fe75930ce0b8d9451d29162942badfd568a1ec47 (diff) |
use NEON 8-bit stages on ARMv7 too
We don't use anything particularly ARMv8-specific in the 8-bit NEON
stages, so we can naturally extend what we're doing to ARMv7 too.
Note that unlike the float stages, we're not requiring VFPv4 either,
just NEON. VFPv4 is for FMA and F16<->F32 conversion, both of which are
unnecessary for the integer pipeline.
GMs and perf improvements are similar to those from the previous ARMv8 change.
Change-Id: Id618801ea1920564c1deee144a640a4133c4505f
Reviewed-on: https://skia-review.googlesource.com/39840
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Diffstat (limited to 'src/jumper')
-rw-r--r-- | src/jumper/SkJumper.cpp | 16 | ||||
-rw-r--r-- | src/jumper/SkJumper.h | 9 | ||||
-rw-r--r-- | src/jumper/SkJumper_stages_8bit.cpp | 8 | ||||
-rwxr-xr-x | src/jumper/build_stages.py | 9 |
4 files changed, 22 insertions, 20 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 315110faf2..8f3e6a749f 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -175,7 +175,7 @@ extern "C" { SK_RASTER_PIPELINE_STAGES(M) #undef M -#if defined(__clang__) && defined(__aarch64__) +#if defined(JUMPER_HAS_NEON_8BIT) // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang. StartPipelineFn sk_start_pipeline_8bit; StageFn sk_just_return_8bit; @@ -208,13 +208,13 @@ extern "C" { } LOWP_STAGES(M) #undef M -#elif defined(__clang__) && defined(__aarch64__) +#elif defined(JUMPER_HAS_NEON_8BIT) template <SkRasterPipeline::StockStage st> - static constexpr StageFn* aarch64_8bit() { return nullptr; } + static constexpr StageFn* neon_8bit() { return nullptr; } - #define M(st) \ - template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() { \ - return sk_##st##_8bit; \ + #define M(st) \ + template <> constexpr StageFn* neon_8bit<SkRasterPipeline::st>() { \ + return sk_##st##_8bit; \ } LOWP_STAGES(M) #undef M @@ -346,9 +346,9 @@ static SkJumper_Engine choose_engine() { #undef M }; } - #elif defined(__clang__) && defined(__aarch64__) + #elif defined(JUMPER_HAS_NEON_8BIT) return { - #define M(st) aarch64_8bit<SkRasterPipeline::st>(), + #define M(st) neon_8bit<SkRasterPipeline::st>(), { SK_RASTER_PIPELINE_STAGES(M) }, sk_start_pipeline_8bit, sk_just_return_8bit, diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h index 20b8d32aba..4bb851f939 100644 --- a/src/jumper/SkJumper.h +++ b/src/jumper/SkJumper.h @@ -50,6 +50,15 @@ #include <stdint.h> #endif +// When compiled with Clang on ARM, we'll have 8-bit NEON stages. 
+#if defined(__clang__) + #if defined(__aarch64__) + #define JUMPER_HAS_NEON_8BIT + #elif defined(__arm__) && defined(__ARM_NEON__) + #define JUMPER_HAS_NEON_8BIT + #endif +#endif + static const int SkJumper_kMaxStride = 8; struct SkJumper_constants { diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp index 0c019f8fbc..b6d94e3bed 100644 --- a/src/jumper/SkJumper_stages_8bit.cpp +++ b/src/jumper/SkJumper_stages_8bit.cpp @@ -14,9 +14,9 @@ // As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code. // Any other platform (so far) is offline-only. -#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__)) +#if defined(JUMPER_IS_OFFLINE) || defined(JUMPER_HAS_NEON_8BIT) -#if defined(__aarch64__) +#if defined(JUMPER_HAS_NEON_8BIT) #include <arm_neon.h> #else #include <immintrin.h> @@ -24,8 +24,6 @@ #if !defined(JUMPER_IS_OFFLINE) #define WRAP(name) sk_##name##_8bit -#elif defined(__aarch64__) - #define WRAP(name) sk_##name##_aarch64_8bit #elif defined(__AVX2__) #define WRAP(name) sk_##name##_hsw_8bit #elif defined(__SSE4_1__) @@ -166,7 +164,7 @@ SI V saturated_add(V a, V b) { b_lo, b_hi; split(a.u8x4, &a_lo, &a_hi); split(b.u8x4, &b_lo, &b_hi); -#if defined(__aarch64__) +#if defined(JUMPER_HAS_NEON_8BIT) return join(vqaddq_u8(a_lo, b_lo), vqaddq_u8(a_hi, b_hi)); #elif defined(__AVX2__) diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py index 688ad60706..728b0a51c4 100755 --- a/src/jumper/build_stages.py +++ b/src/jumper/build_stages.py @@ -109,10 +109,6 @@ vfp4 = [ subprocess.check_call(clang + cflags + vfp4 + ['-c', stages] + ['-o', 'vfp4.o']) -# TODO: should work fine... 
I just want to turn this one on separately from x86 -#subprocess.check_call(clang + cflags + vfp4 + -# ['-c', stages_8bit] + -# ['-o', '8bit_vfp4.o']) def parse_object_file(dot_o, directive, target=None): globl, hidden, label, comment, align = \ @@ -223,12 +219,11 @@ print '#endif' print '.text' print '#if defined(__arm__)' print 'BALIGN4' -parse_object_file( 'vfp4.o', '.long', target='elf32-littlearm') -#parse_object_file('8bit_vfp4.o', '.long', target='elf32-littlearm') +parse_object_file('vfp4.o', '.long', target='elf32-littlearm') print '#elif defined(__x86_64__)' print 'BALIGN32' -parse_object_file('merged.o', '.byte') +parse_object_file('merged.o', '.byte') print '#elif defined(__i386__)' print 'BALIGN32' |