aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/jumper
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-08-28 15:51:46 -0400
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-08-28 21:04:07 +0000
commit08133583d5e1cdfdcc41b4bb078fcfb64137f058 (patch)
tree213dbefcb5d7b2020eed2c19b6f33c281f6548d4 /src/jumper
parentc5b2c86cd1fca9f17d814ae750378843ca0ce216 (diff)
8-bit jumper on armv8
The GM diffs are all minor and what you'd expect. I did a quick performance sanity check, which also looks fine. $ out/ok bench rp filter:search=Modulate [blendmode_rect_Modulate] 30.2ms @0 32ms @95 32ms @100 [blendmode_mask_Modulate] 12.6ms @0 12.6ms @95 14.5ms @100 ~~~> [blendmode_rect_Modulate] 11.2ms @0 11.7ms @95 12.4ms @100 [blendmode_mask_Modulate] 10.5ms @0 23.6ms @95 23.9ms @100 This isn't even really the fastest we can make 8-bit go on ARMv8; it's actually much more natural to work de-interlaced there. Lots of room to follow up. Change-Id: I86b1099f6742bcb0b8b4fa153e85eaba9567cbf7 Reviewed-on: https://skia-review.googlesource.com/39740 Reviewed-by: Florin Malita <fmalita@chromium.org> Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper')
-rw-r--r--src/jumper/SkJumper.cpp42
-rw-r--r--src/jumper/SkJumper_stages_8bit.cpp37
2 files changed, 60 insertions, 19 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 9f8e970f32..315110faf2 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -110,7 +110,7 @@ using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**,K*);
extern "C" {
#if __has_feature(memory_sanitizer)
- // We'll just run portable code.
+ // We'll just run baseline code.
#elif defined(__arm__)
StartPipelineFn ASM(start_pipeline,vfp4);
@@ -168,12 +168,22 @@ extern "C" {
#endif
- // Portable, single-pixel stages.
+ // Baseline code compiled as a normal part of Skia.
StartPipelineFn sk_start_pipeline;
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
SK_RASTER_PIPELINE_STAGES(M)
#undef M
+
+#if defined(__clang__) && defined(__aarch64__)
+ // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
+ StartPipelineFn sk_start_pipeline_8bit;
+ StageFn sk_just_return_8bit;
+ #define M(st) StageFn sk_##st##_8bit;
+ SK_RASTER_PIPELINE_STAGES(M)
+ #undef M
+#endif
+
}
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
@@ -198,6 +208,16 @@ extern "C" {
}
LOWP_STAGES(M)
#undef M
+#elif defined(__clang__) && defined(__aarch64__)
+ template <SkRasterPipeline::StockStage st>
+ static constexpr StageFn* aarch64_8bit() { return nullptr; }
+
+ #define M(st) \
+ template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() { \
+ return sk_##st##_8bit; \
+ }
+ LOWP_STAGES(M)
+ #undef M
#endif
// Engines comprise everything we need to run SkRasterPipelines.
@@ -207,20 +227,20 @@ struct SkJumper_Engine {
StageFn* just_return;
};
-// We'll default to this portable engine, but try to choose a better one at runtime.
-static const SkJumper_Engine kPortable = {
+// We'll default to this baseline engine, but try to choose a better one at runtime.
+static const SkJumper_Engine kBaseline = {
#define M(stage) sk_##stage,
{ SK_RASTER_PIPELINE_STAGES(M) },
#undef M
sk_start_pipeline,
sk_just_return,
};
-static SkJumper_Engine gEngine = kPortable;
+static SkJumper_Engine gEngine = kBaseline;
static SkOnce gChooseEngineOnce;
static SkJumper_Engine choose_engine() {
#if __has_feature(memory_sanitizer)
- // We'll just run portable code.
+ // We'll just run baseline code.
#elif defined(__arm__)
if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
@@ -283,7 +303,7 @@ static SkJumper_Engine choose_engine() {
}
#endif
- return kPortable;
+ return kBaseline;
}
#ifndef SK_JUMPER_DISABLE_8BIT
@@ -326,6 +346,14 @@ static SkJumper_Engine choose_engine() {
#undef M
};
}
+ #elif defined(__clang__) && defined(__aarch64__)
+ return {
+ #define M(st) aarch64_8bit<SkRasterPipeline::st>(),
+ { SK_RASTER_PIPELINE_STAGES(M) },
+ sk_start_pipeline_8bit,
+ sk_just_return_8bit,
+ #undef M
+ };
#endif
return kNone;
}
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 5c73ea8cbe..edd6689c8c 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -5,23 +5,27 @@
* found in the LICENSE file.
*/
+// This restricted SkJumper backend works on 8-bit per channel interlaced
+// pixels. This is the natural format for kN32_SkColorType buffers, and we
+// hope the stages in this file can replace many custom legacy routines.
+
#include "SkJumper.h"
#include "SkJumper_misc.h"
-#if defined(__SSE2__)
+// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
+// Any other platform (so far) is offline-only.
+#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
+
+#if defined(__aarch64__)
+ #include <arm_neon.h>
+#else
#include <immintrin.h>
#endif
-// This restricted SkJumper backend works on 8-bit per channel interlaced
-// pixels. This is the natural format for kN32_SkColorType buffers, and we
-// hope the stages in this file can replace many custom legacy routines.
-
#if !defined(JUMPER_IS_OFFLINE)
- #error "This file must be pre-compiled."
+ #define WRAP(name) sk_##name##_8bit
#elif defined(__aarch64__)
#define WRAP(name) sk_##name##_aarch64_8bit
-#elif defined(__arm__)
- #define WRAP(name) sk_##name##_vfp4_8bit
#elif defined(__AVX2__)
#define WRAP(name) sk_##name##_hsw_8bit
#elif defined(__SSE4_1__)
@@ -112,7 +116,7 @@ SI V operator*(V x, V y) {
template <typename T>
SI T inv(T v) { return 0xff - v; }
-SI V two(V v) { return v + v; }
+
SI V lerp(V from, V to, V t) { return to*t + from*inv(t); }
SI V alpha(V v) {
@@ -162,10 +166,13 @@ SI V saturated_add(V a, V b) {
b_lo, b_hi;
split(a.u8x4, &a_lo, &a_hi);
split(b.u8x4, &b_lo, &b_hi);
-#if defined(__AVX2__)
+#if defined(__aarch64__)
+ return join(vqaddq_u8(a_lo, b_lo),
+ vqaddq_u8(a_hi, b_hi));
+#elif defined(__AVX2__)
return join(_mm256_adds_epu8(a_lo, b_lo),
_mm256_adds_epu8(a_hi, b_hi));
-#else
+#elif defined(__SSE2__)
return join(_mm_adds_epu8(a_lo, b_lo),
_mm_adds_epu8(a_hi, b_hi));
#endif
@@ -185,7 +192,11 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d
MAYBE_MSABI
extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
void** program, const SkJumper_constants*) {
- R r;
+#if defined(JUMPER_IS_OFFLINE)
+ R r; // Fastest to start uninitialized.
+#else
+ R r{}; // Next best is zero'd for compilers that will complain about uninitialized values.
+#endif
auto start = (Stage*)load_and_inc(program);
for (; y < ylimit; y++) {
Params params = { x,y,0 };
@@ -461,3 +472,5 @@ STAGE(overlay) {
// colorburn |
// colordodge > these involve division, which makes them (much) slower than the float stages.
// softlight |
+
+#endif