aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/core/Sk2x.h3
-rw-r--r--src/opts/Sk2x_neon.h24
-rw-r--r--src/opts/Sk2x_none.h3
-rw-r--r--src/opts/Sk2x_sse.h2
-rw-r--r--src/opts/Sk4x_neon.h14
5 files changed, 36 insertions, 10 deletions
diff --git a/src/core/Sk2x.h b/src/core/Sk2x.h
index a64ad721a5..9b4e5ee61c 100644
--- a/src/core/Sk2x.h
+++ b/src/core/Sk2x.h
@@ -49,14 +49,17 @@ public:
Sk2x add(const Sk2x&) const;
Sk2x subtract(const Sk2x&) const;
Sk2x multiply(const Sk2x&) const;
+ Sk2x divide(const Sk2x&) const;
Sk2x operator +(const Sk2x& o) const { return this->add(o); }
Sk2x operator -(const Sk2x& o) const { return this->subtract(o); }
Sk2x operator *(const Sk2x& o) const { return this->multiply(o); }
+ Sk2x operator /(const Sk2x& o) const { return this->divide(o); }
Sk2x& operator +=(const Sk2x& o) { return (*this = *this + o); }
Sk2x& operator -=(const Sk2x& o) { return (*this = *this - o); }
Sk2x& operator *=(const Sk2x& o) { return (*this = *this * o); }
+ Sk2x& operator /=(const Sk2x& o) { return (*this = *this / o); }
Sk2x negate() const { return Sk2x((T)0) - *this; }
Sk2x operator -() const { return this->negate(); }
diff --git a/src/opts/Sk2x_neon.h b/src/opts/Sk2x_neon.h
index 00ab00aeaa..ef61df4823 100644
--- a/src/opts/Sk2x_neon.h
+++ b/src/opts/Sk2x_neon.h
@@ -41,6 +41,16 @@ M(void) store(float vals[2]) const { vst1_f32(vals, fVec); }
M(Sk2f) add(const Sk2f& o) const { return vadd_f32(fVec, o.fVec); }
M(Sk2f) subtract(const Sk2f& o) const { return vsub_f32(fVec, o.fVec); }
M(Sk2f) multiply(const Sk2f& o) const { return vmul_f32(fVec, o.fVec); }
+M(Sk2f) divide(const Sk2f& o) const {
+#if defined(SK_CPU_ARM64)
+ return vdiv_f32(fVec, o.fVec);
+#else
+ float32x2_t est0 = vrecpe_f32(o.fVec),
+ est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0),
+ est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1);
+ return vmul_f32(est2, fVec);
+#endif
+}
M(Sk2f) Min(const Sk2f& a, const Sk2f& b) { return vmin_f32(a.fVec, b.fVec); }
M(Sk2f) Max(const Sk2f& a, const Sk2f& b) { return vmax_f32(a.fVec, b.fVec); }
@@ -51,10 +61,14 @@ M(Sk2f) rsqrt() const {
return est1;
}
M(Sk2f) sqrt() const {
+#if defined(SK_CPU_ARM64)
+ return vsqrt_f32(fVec);
+#else
float32x2_t est1 = this->rsqrt().fVec,
// An extra step of Newton's method to refine the estimate of 1/sqrt(this).
est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
return vmul_f32(fVec, est2);
+#endif
}
#undef M
@@ -73,6 +87,7 @@ M(Sk2f) sqrt() const {
M(Sk2d) add(const Sk2d& o) const { return vaddq_f64(fVec, o.fVec); }
M(Sk2d) subtract(const Sk2d& o) const { return vsubq_f64(fVec, o.fVec); }
M(Sk2d) multiply(const Sk2d& o) const { return vmulq_f64(fVec, o.fVec); }
+ M(Sk2d) divide(const Sk2d& o) const { return vdivq_f64(fVec, o.fVec); }
M(Sk2d) Min(const Sk2d& a, const Sk2d& b) { return vminq_f64(a.fVec, b.fVec); }
M(Sk2d) Max(const Sk2d& a, const Sk2d& b) { return vmaxq_f64(a.fVec, b.fVec); }
@@ -82,13 +97,7 @@ M(Sk2f) sqrt() const {
est1 = vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est0, est0)), est0);
return est1;
}
- M(Sk2d) sqrt() const {
- float64x2_t est1 = this->rsqrt().fVec,
- // Two extra steps of Newton's method to refine the estimate of 1/sqrt(this).
- est2 = vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est1, est1)), est1),
- est3 = vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est2, est2)), est2);
- return vmulq_f64(fVec, est3);
- }
+ M(Sk2d) sqrt() const { return vsqrtq_f64(fVec); }
#else // Scalar implementation for 32-bit chips, which don't have float64x2_t.
M() Sk2x() {}
@@ -106,6 +115,7 @@ M(Sk2f) sqrt() const {
M(Sk2d) add(const Sk2d& o) const { return Sk2d(fVec[0] + o.fVec[0], fVec[1] + o.fVec[1]); }
M(Sk2d) subtract(const Sk2d& o) const { return Sk2d(fVec[0] - o.fVec[0], fVec[1] - o.fVec[1]); }
M(Sk2d) multiply(const Sk2d& o) const { return Sk2d(fVec[0] * o.fVec[0], fVec[1] * o.fVec[1]); }
+ M(Sk2d) divide(const Sk2d& o) const { return Sk2d(fVec[0] / o.fVec[0], fVec[1] / o.fVec[1]); }
M(Sk2d) Min(const Sk2d& a, const Sk2d& b) {
return Sk2d(SkTMin(a.fVec[0], b.fVec[0]), SkTMin(a.fVec[1], b.fVec[1]));
diff --git a/src/opts/Sk2x_none.h b/src/opts/Sk2x_none.h
index 12daffd72f..65cc6703fb 100644
--- a/src/opts/Sk2x_none.h
+++ b/src/opts/Sk2x_none.h
@@ -43,6 +43,9 @@ M(Sk2x<T>) subtract(const Sk2x<T>& o) const {
M(Sk2x<T>) multiply(const Sk2x<T>& o) const {
return Sk2x<T>(fVec[0] * o.fVec[0], fVec[1] * o.fVec[1]);
}
+M(Sk2x<T>) divide(const Sk2x<T>& o) const {
+ return Sk2x<T>(fVec[0] / o.fVec[0], fVec[1] / o.fVec[1]);
+}
M(Sk2x<T>) Min(const Sk2x<T>& a, const Sk2x<T>& b) {
return Sk2x<T>(SkTMin(a.fVec[0], b.fVec[0]), SkTMin(a.fVec[1], b.fVec[1]));
diff --git a/src/opts/Sk2x_sse.h b/src/opts/Sk2x_sse.h
index 71071c08c7..111d3c230e 100644
--- a/src/opts/Sk2x_sse.h
+++ b/src/opts/Sk2x_sse.h
@@ -38,6 +38,7 @@ M(void) store(float vals[2]) const { _mm_storel_pi((__m64*)vals, fVec); }
M(Sk2f) add(const Sk2f& o) const { return _mm_add_ps(fVec, o.fVec); }
M(Sk2f) subtract(const Sk2f& o) const { return _mm_sub_ps(fVec, o.fVec); }
M(Sk2f) multiply(const Sk2f& o) const { return _mm_mul_ps(fVec, o.fVec); }
+M(Sk2f) divide(const Sk2f& o) const { return _mm_div_ps(fVec, o.fVec); }
M(Sk2f) Min(const Sk2f& a, const Sk2f& b) { return _mm_min_ps(a.fVec, b.fVec); }
M(Sk2f) Max(const Sk2f& a, const Sk2f& b) { return _mm_max_ps(a.fVec, b.fVec); }
@@ -60,6 +61,7 @@ M(void) store(double vals[2]) const { _mm_storeu_pd(vals, fVec); }
M(Sk2d) add(const Sk2d& o) const { return _mm_add_pd(fVec, o.fVec); }
M(Sk2d) subtract(const Sk2d& o) const { return _mm_sub_pd(fVec, o.fVec); }
M(Sk2d) multiply(const Sk2d& o) const { return _mm_mul_pd(fVec, o.fVec); }
+M(Sk2d) divide(const Sk2d& o) const { return _mm_div_pd(fVec, o.fVec); }
M(Sk2d) Min(const Sk2d& a, const Sk2d& b) { return _mm_min_pd(a.fVec, b.fVec); }
M(Sk2d) Max(const Sk2d& a, const Sk2d& b) { return _mm_max_pd(a.fVec, b.fVec); }
diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h
index 92cde11711..41f6f46885 100644
--- a/src/opts/Sk4x_neon.h
+++ b/src/opts/Sk4x_neon.h
@@ -62,10 +62,14 @@ M(Sk4f) subtract(const Sk4f& o) const { return vsubq_f32(fVec, o.fVec); }
M(Sk4f) multiply(const Sk4f& o) const { return vmulq_f32(fVec, o.fVec); }
M(Sk4f) divide (const Sk4f& o) const {
- float32x4_t est0 = vrecpeq_f32(o.fVec);
- float32x4_t est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0);
- float32x4_t est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
+#if defined(SK_CPU_ARM64)
+ return vdivq_f32(fVec, o.fVec);
+#else
+ float32x4_t est0 = vrecpeq_f32(o.fVec),
+ est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),
+ est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
return vmulq_f32(est2, fVec);
+#endif
}
M(Sk4f) rsqrt() const {
@@ -75,10 +79,14 @@ M(Sk4f) rsqrt() const {
}
M(Sk4f) sqrt() const {
+#if defined(SK_CPU_ARM64)
+ return vsqrtq_f32(fVec);
+#else
float32x4_t est1 = this->rsqrt().fVec,
// An extra step of Newton's method to refine the estimate of 1/sqrt(this).
est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
return vmulq_f32(fVec, est2);
+#endif
}
M(Sk4i) equal (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }