diff options
-rw-r--r-- | src/opts/SkNx_neon.h | 18 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 30 | ||||
-rw-r--r-- | tests/SkNxTest.cpp | 8 |
3 files changed, 44 insertions, 12 deletions
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index 8bf6e7759c..1552e0c181 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -12,15 +12,20 @@ namespace { -// ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: +// ARMv8 has vrndm(q)_f32 to floor floats. Here we emulate it: // - roundtrip through integers via truncation // - subtract 1 if that's too big (possible for negative values). // This restricts the domain of our inputs to a maximum somehwere around 2^31. Seems plenty big. -AI static float32x4_t armv7_vrndmq_f32(float32x4_t v) { +AI static float32x4_t emulate_vrndmq_f32(float32x4_t v) { auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); auto too_big = vcgtq_f32(roundtrip, v); return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1))); } +AI static float32x2_t emulate_vrndm_f32(float32x2_t v) { + auto roundtrip = vcvt_f32_s32(vcvt_s32_f32(v)); + auto too_big = vcgt_f32(roundtrip, v); + return vsub_f32(roundtrip, (float32x2_t)vand_u32(too_big, (uint32x2_t)vdup_n_f32(1))); +} template <> class SkNx<2, float> { @@ -102,6 +107,13 @@ public: AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); } AI SkNx abs() const { return vabs_f32(fVec); } + AI SkNx floor() const { + #if defined(SK_CPU_ARM64) + return vrndm_f32(fVec); + #else + return emulate_vrndm_f32(fVec); + #endif + } AI SkNx rsqrt() const { float32x2_t est0 = vrsqrte_f32(fVec); @@ -223,7 +235,7 @@ public: #if defined(SK_CPU_ARM64) return vrndmq_f32(fVec); #else - return armv7_vrndmq_f32(fVec); + return emulate_vrndmq_f32(fVec); #endif } diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index b8f175c919..1720f14fb8 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -15,6 +15,17 @@ namespace { +// Emulate _mm_floor_ps() with SSE2: +// - roundtrip through integers via truncation +// - subtract 1 if that's too big (possible for negative values). 
+// This restricts the domain of our inputs to a maximum somewhere around 2^31. +// Seems plenty big. +AI static __m128 emulate_mm_floor_ps(__m128 v) { + __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); + __m128 too_big = _mm_cmpgt_ps(roundtrip, v); + return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f))); +} + template <> class SkNx<2, float> { public: @@ -71,7 +82,15 @@ public: AI static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); } AI static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); } - AI SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); } + AI SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); } + AI SkNx floor() const { + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 + return _mm_floor_ps(fVec); + #else + return emulate_mm_floor_ps(fVec); + #endif + } + AI SkNx sqrt() const { return _mm_sqrt_ps (fVec); } AI SkNx rsqrt() const { return _mm_rsqrt_ps(fVec); } AI SkNx invert() const { return _mm_rcp_ps(fVec); } @@ -161,14 +180,7 @@ public: #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 return _mm_floor_ps(fVec); #else - // Emulate _mm_floor_ps() with SSE2: - // - roundtrip through integers via truncation - // - subtract 1 if that's too big (possible for negative values). - // This restricts the domain of our inputs to a maximum somehwere around 2^31. - // Seems plenty big. 
- __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(fVec)); - __m128 too_big = _mm_cmpgt_ps(roundtrip, fVec); - return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f))); + return emulate_mm_floor_ps(fVec); #endif } diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp index 933a5fc877..a454043936 100644 --- a/tests/SkNxTest.cpp +++ b/tests/SkNxTest.cpp @@ -257,6 +257,14 @@ DEF_TEST(SkNx_floor, r) { REPORTER_ASSERT(r, fs[1] == -1.0f); REPORTER_ASSERT(r, fs[2] == 0.0f); REPORTER_ASSERT(r, fs[3] == -1.0f); + + auto fs2 = Sk2f(0.4f, -0.4f).floor(); + REPORTER_ASSERT(r, fs2[0] == 0.0f); + REPORTER_ASSERT(r, fs2[1] == -1.0f); + + auto fs3 = Sk2f(0.6f, -0.6f).floor(); + REPORTER_ASSERT(r, fs3[0] == 0.0f); + REPORTER_ASSERT(r, fs3[1] == -1.0f); } DEF_TEST(SkNx_shuffle, r) { |