3 files changed, 44 insertions, 12 deletions
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 8bf6e7759c..1552e0c181 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -12,15 +12,20 @@
 
 namespace {
 
-// ARMv8 has vrndmq_f32 to floor 4 floats.  Here we emulate it:
+// ARMv8 has vrndm(q)_f32 to floor floats.  Here we emulate it:
 //   - roundtrip through integers via truncation
 //   - subtract 1 if that's too big (possible for negative values).
 // This restricts the domain of our inputs to a maximum somehwere around 2^31.  Seems plenty big.
-AI static float32x4_t armv7_vrndmq_f32(float32x4_t v) {
+AI static float32x4_t emulate_vrndmq_f32(float32x4_t v) {  // floors all 4 lanes of v
     auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
     auto too_big = vcgtq_f32(roundtrip, v);
     return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)));
 }
+AI static float32x2_t emulate_vrndm_f32(float32x2_t v) {  // 2-lane twin of emulate_vrndmq_f32
+    auto roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));  // float -> int32 (truncating) -> float
+    auto too_big = vcgt_f32(roundtrip, v);  // lane mask: set where the roundtrip exceeds v
+    return vsub_f32(roundtrip, (float32x2_t)vand_u32(too_big, (uint32x2_t)vdup_n_f32(1)));
+}
 
 template <>
 class SkNx<2, float> {
@@ -102,6 +107,13 @@ public:
     AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }
 
     AI SkNx abs() const { return vabs_f32(fVec); }
+    AI SkNx floor() const {  // per-lane floor of fVec
+    #if defined(SK_CPU_ARM64)
+        return vrndm_f32(fVec);  // native floor intrinsic, used when SK_CPU_ARM64 is defined
+    #else
+        return emulate_vrndm_f32(fVec);  // fallback: truncate, then subtract 1 where too big
+    #endif
+    }
 
     AI SkNx rsqrt() const {
         float32x2_t est0 = vrsqrte_f32(fVec);
@@ -223,7 +235,7 @@ public:
     #if defined(SK_CPU_ARM64)
         return vrndmq_f32(fVec);
     #else
-        return armv7_vrndmq_f32(fVec);
+        return emulate_vrndmq_f32(fVec);
     #endif
     }
 
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index b8f175c919..1720f14fb8 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -15,6 +15,17 @@
 
 namespace {
 
+// Emulate _mm_floor_ps() with SSE2:
+//   - roundtrip through integers via truncation
+//   - subtract 1 if that's too big (possible for negative values).
+// This restricts the domain of our inputs to a maximum somewhere around 2^31.
+// Seems plenty big.
+AI static __m128 emulate_mm_floor_ps(__m128 v) {  // returns the per-lane floor of v
+    __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));  // truncate to int32, convert back
+    __m128 too_big = _mm_cmpgt_ps(roundtrip, v);  // lane mask: set where the roundtrip exceeds v
+    return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));  // minus 1 where set
+}
+
 template <>
 class SkNx<2, float> {
 public:
@@ -71,7 +82,15 @@ public:
     AI static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
     AI static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }
 
-    AI SkNx    abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
+    AI SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }  // clears sign bits
+    AI SkNx floor() const {  // per-lane floor of fVec
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+        return _mm_floor_ps(fVec);  // native floor, available from SSE4.1 up
+    #else
+        return emulate_mm_floor_ps(fVec);  // SSE2 fallback: truncate, then subtract 1 if too big
+    #endif
+    }
+
     AI SkNx   sqrt() const { return _mm_sqrt_ps (fVec);  }
     AI SkNx  rsqrt() const { return _mm_rsqrt_ps(fVec); }
     AI SkNx invert() const { return _mm_rcp_ps(fVec); }
@@ -161,14 +180,7 @@ public:
     #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
         return _mm_floor_ps(fVec);
     #else
-        // Emulate _mm_floor_ps() with SSE2:
-        //   - roundtrip through integers via truncation
-        //   - subtract 1 if that's too big (possible for negative values).
-        // This restricts the domain of our inputs to a maximum somehwere around 2^31.
-        // Seems plenty big.
-        __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(fVec));
-        __m128 too_big = _mm_cmpgt_ps(roundtrip, fVec);
-        return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));
+        return emulate_mm_floor_ps(fVec);
     #endif
     }
 
diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp
index 933a5fc877..a454043936 100644
--- a/tests/SkNxTest.cpp
+++ b/tests/SkNxTest.cpp
@@ -257,6 +257,14 @@ DEF_TEST(SkNx_floor, r) {
     REPORTER_ASSERT(r, fs[1] == -1.0f);
     REPORTER_ASSERT(r, fs[2] ==  0.0f);
     REPORTER_ASSERT(r, fs[3] == -1.0f);
+
+    auto fs2 = Sk2f(0.4f, -0.4f).floor();
+    REPORTER_ASSERT(r, fs2[0] ==  0.0f);
+    REPORTER_ASSERT(r, fs2[1] == -1.0f);
+
+    auto fs3 = Sk2f(0.6f, -0.6f).floor();
+    REPORTER_ASSERT(r, fs3[0] ==  0.0f);
+    REPORTER_ASSERT(r, fs3[1] == -1.0f);
 }
 
 DEF_TEST(SkNx_shuffle, r) {