aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts/Sk4x_neon.h
diff options
context:
space:
mode:
author: mtklein <mtklein@chromium.org> 2015-03-19 12:08:29 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-03-19 12:08:29 -0700
commit: 5553bedf603e1f1a55d2fd6c70c2ca3a831b3801 (patch)
tree: 2b50af8ca66b063c74aa0e5c775463df607fdcc0 /src/opts/Sk4x_neon.h
parent: 950e986b1bc127af1f484572d2494091957486f9 (diff)
Sk2x for NEON
Also decreases the precision of Sk4f::rsqrt() for speed, while keeping the precision of Sk4f::sqrt() the same: instead of doing two estimation steps in rsqrt(), do one there and one more in sqrt().

Tests pass on my Nexus 7. float64x2_t is still a TODO for when I get a hold of a Nexus 9.

BUG=skia:
Review URL: https://codereview.chromium.org/1018423003
Diffstat (limited to 'src/opts/Sk4x_neon.h')
-rw-r--r-- src/opts/Sk4x_neon.h 14
1 files changed, 9 insertions, 5 deletions
diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h
index 2851fb31a4..3f35fe785b 100644
--- a/src/opts/Sk4x_neon.h
+++ b/src/opts/Sk4x_neon.h
@@ -82,13 +82,17 @@ M(Sk4f) divide (const Sk4f& o) const {
}
M(Sk4f) rsqrt() const {
- float32x4_t est0 = vrsqrteq_f32(fVec);
- float32x4_t est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
- float32x4_t est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
- return est2;
+ float32x4_t est0 = vrsqrteq_f32(fVec),
+ est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
+ return est1;
}
-M(Sk4f) sqrt() const { return this->multiply(this->rsqrt()); }
+M(Sk4f) sqrt() const {
+ float32x4_t est1 = this->rsqrt().fVec,
+ // An extra step of Newton's method to refine the estimate of 1/sqrt(this).
+ est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
+ return vmulq_f32(fVec, est2);
+}
M(Sk4i) equal (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }
M(Sk4i) notEqual (const Sk4f& o) const { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); }