path: root/src/jumper/SkJumper_stages_8bit.cpp
Diffstat (limited to 'src/jumper/SkJumper_stages_8bit.cpp')
-rw-r--r--  src/jumper/SkJumper_stages_8bit.cpp  41
1 file changed, 29 insertions(+), 12 deletions(-)
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 5c73ea8cbe..0c019f8fbc 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -5,23 +5,27 @@
* found in the LICENSE file.
*/
+// This restricted SkJumper backend works on 8-bit per channel interlaced
+// pixels. This is the natural format for kN32_SkColorType buffers, and we
+// hope the stages in this file can replace many custom legacy routines.
+
#include "SkJumper.h"
#include "SkJumper_misc.h"
-#if defined(__SSE2__)
+// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
+// Any other platform (so far) is offline-only.
+#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
+
+#if defined(__aarch64__)
+ #include <arm_neon.h>
+#else
#include <immintrin.h>
#endif
-// This restricted SkJumper backend works on 8-bit per channel interlaced
-// pixels. This is the natural format for kN32_SkColorType buffers, and we
-// hope the stages in this file can replace many custom legacy routines.
-
#if !defined(JUMPER_IS_OFFLINE)
- #error "This file must be pre-compiled."
+ #define WRAP(name) sk_##name##_8bit
#elif defined(__aarch64__)
#define WRAP(name) sk_##name##_aarch64_8bit
-#elif defined(__arm__)
- #define WRAP(name) sk_##name##_vfp4_8bit
#elif defined(__AVX2__)
#define WRAP(name) sk_##name##_hsw_8bit
#elif defined(__SSE4_1__)
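For reference, a sketch of how WRAP(name) resolves under the #if chain above (the expansions follow directly from the macros shown; the build configurations named are illustrative):

// In-tree clang/aarch64 build (JUMPER_IS_OFFLINE not defined):
//     WRAP(start_pipeline)  ->  sk_start_pipeline_8bit
// Offline aarch64 build:
//     WRAP(start_pipeline)  ->  sk_start_pipeline_aarch64_8bit
// Offline AVX2 build:
//     WRAP(start_pipeline)  ->  sk_start_pipeline_hsw_8bit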
@@ -112,7 +116,7 @@ SI V operator*(V x, V y) {
template <typename T>
SI T inv(T v) { return 0xff - v; }
-SI V two(V v) { return v + v; }
+
SI V lerp(V from, V to, V t) { return to*t + from*inv(t); }
SI V alpha(V v) {
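As a rough scalar model of what inv() and lerp() compute per channel (the div255 rounding here is an assumption for illustration; in the file itself the vector operator* folds the divide-by-255 into the multiply):

#include <cstdint>

// Hypothetical scalar sketch of the 8-bit fixed-point lerp above.
static uint8_t div255(uint32_t v) { return uint8_t((v + 127) / 255); }  // round to nearest

static uint8_t lerp8(uint8_t from, uint8_t to, uint8_t t) {
    // to*t + from*(0xff - t), each product scaled back into 0..255.
    return uint8_t(div255(to * t) + div255(from * (0xff - t)));
}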
@@ -162,10 +166,13 @@ SI V saturated_add(V a, V b) {
b_lo, b_hi;
split(a.u8x4, &a_lo, &a_hi);
split(b.u8x4, &b_lo, &b_hi);
-#if defined(__AVX2__)
+#if defined(__aarch64__)
+ return join(vqaddq_u8(a_lo, b_lo),
+ vqaddq_u8(a_hi, b_hi));
+#elif defined(__AVX2__)
return join(_mm256_adds_epu8(a_lo, b_lo),
_mm256_adds_epu8(a_hi, b_hi));
-#else
+#elif defined(__SSE2__)
return join(_mm_adds_epu8(a_lo, b_lo),
_mm_adds_epu8(a_hi, b_hi));
#endif
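The NEON and SSE paths above implement identical per-lane semantics; a scalar sketch of unsigned saturating add, purely for illustration:

#include <cstdint>

// Per-lane model of vqaddq_u8 / _mm256_adds_epu8 / _mm_adds_epu8:
// the sum clamps at 0xff instead of wrapping around.
static uint8_t saturated_add_lane(uint8_t a, uint8_t b) {
    unsigned sum = unsigned(a) + unsigned(b);
    return sum > 0xff ? 0xff : uint8_t(sum);
}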
@@ -185,7 +192,11 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d
MAYBE_MSABI
extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
void** program, const SkJumper_constants*) {
- R r;
+#if defined(JUMPER_IS_OFFLINE)
+ R r; // Fastest to start uninitialized.
+#else
+ R r{}; // Next best is zero'd for compilers that will complain about uninitialized values.
+#endif
auto start = (Stage*)load_and_inc(program);
for (; y < ylimit; y++) {
Params params = { x,y,0 };
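For context, the program walked here is a flat array of void* holding stage function pointers and their context pointers; a simplified sketch of the load_and_inc pattern (hypothetical, not the actual SkJumper_misc.h code):

// Read the next entry of the program and advance the cursor, so each stage
// can pull its own context and then call the following stage.
static void* load_and_inc(void**& program) {
    return *program++;
}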
@@ -223,6 +234,7 @@ SI V load(const T* src, size_t tail) {
if (__builtin_expect(tail, 0)) {
V v = 0;
switch (tail) {
+ #if defined(__AVX2__)
case 15: v[14] = src[14];
case 14: v[13] = src[13];
case 13: v[12] = src[12];
@@ -231,6 +243,7 @@ SI V load(const T* src, size_t tail) {
case 10: v[ 9] = src[ 9];
case 9: v[ 8] = src[ 8];
case 8: memcpy(&v, src, 8*sizeof(T)); break;
+ #endif
case 7: v[6] = src[6];
case 6: v[5] = src[5];
case 5: v[4] = src[4];
@@ -249,6 +262,7 @@ SI void store(T* dst, V v, size_t tail) {
__builtin_assume(tail < kStride);
if (__builtin_expect(tail, 0)) {
switch (tail) {
+ #if defined(__AVX2__)
case 15: dst[14] = v[14];
case 14: dst[13] = v[13];
case 13: dst[12] = v[12];
@@ -257,6 +271,7 @@ SI void store(T* dst, V v, size_t tail) {
case 10: dst[ 9] = v[ 9];
case 9: dst[ 8] = v[ 8];
case 8: memcpy(dst, &v, 8*sizeof(T)); break;
+ #endif
case 7: dst[6] = v[6];
case 6: dst[5] = v[5];
case 5: dst[4] = v[4];
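Both switches rely on deliberate fall-through so that a tail of N pixels touches lanes N-1 down to 0, with a memcpy covering the 8-lane half; a standalone scalar sketch of the same partial-load idea, using a hypothetical 4-lane helper:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustration only: tail == 0 means "full vector"; otherwise only lanes
// [0, tail) are valid, and the fall-through copies them from high to low.
static void load_tail4(const uint8_t* src, uint8_t v[4], size_t tail) {
    if (tail) {
        switch (tail) {
            case 3: v[2] = src[2];  // fall through
            case 2: v[1] = src[1];  // fall through
            case 1: v[0] = src[0];
        }
        return;
    }
    memcpy(v, src, 4);  // no tail: load all four lanes
}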
@@ -461,3 +476,5 @@ STAGE(overlay) {
// colorburn |
// colordodge > these involve division, which makes them (much) slower than the float stages.
// softlight |
+
+#endif
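The modes left out above all need a per-channel divide; a simplified, alpha-free sketch of a colordodge channel (the formula and rounding are an illustration, not the Skia implementation) shows why that is costly in 8-bit:

#include <cstdint>

// colordodge brightens dst by dividing it by the inverse of src, clamping
// at 0xff. The integer divide per channel is what makes an 8-bit version
// (much) slower than the float stages.
static uint8_t colordodge_channel(uint8_t s, uint8_t d) {
    if (s == 0xff) return 0xff;                    // avoid divide by zero
    uint32_t q = (uint32_t(d) * 0xff) / (0xff - s);
    return q > 0xff ? 0xff : uint8_t(q);
}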