diff options
author | 2015-03-19 12:08:29 -0700 | |
---|---|---|
committer | 2015-03-19 12:08:29 -0700 | |
commit | 5553bedf603e1f1a55d2fd6c70c2ca3a831b3801 (patch) | |
tree | 2b50af8ca66b063c74aa0e5c775463df607fdcc0 /src/opts/Sk4x_neon.h | |
parent | 950e986b1bc127af1f484572d2494091957486f9 (diff) |
Sk2x for NEON
Also decreases the precision of Sk4f::rsqrt() for speed, keeping Sk4f::sqrt() the same:
instead of doing two estimation steps in rsqrt(), do one there and one more in sqrt().
Tests pass on my Nexus 7. float64x2_t is still a TODO for when I get a hold of a Nexus 9.
BUG=skia:
Review URL: https://codereview.chromium.org/1018423003
Diffstat (limited to 'src/opts/Sk4x_neon.h')
-rw-r--r-- | src/opts/Sk4x_neon.h | 14 |
1 files changed, 9 insertions, 5 deletions
```diff
diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h
index 2851fb31a4..3f35fe785b 100644
--- a/src/opts/Sk4x_neon.h
+++ b/src/opts/Sk4x_neon.h
@@ -82,13 +82,17 @@ M(Sk4f) divide (const Sk4f& o) const {
 }
 
 M(Sk4f) rsqrt() const {
-    float32x4_t est0 = vrsqrteq_f32(fVec);
-    float32x4_t est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
-    float32x4_t est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
-    return est2;
+    float32x4_t est0 = vrsqrteq_f32(fVec),
+                est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
+    return est1;
 }
 
-M(Sk4f) sqrt() const { return this->multiply(this->rsqrt()); }
+M(Sk4f) sqrt() const {
+    float32x4_t est1 = this->rsqrt().fVec,
+    // An extra step of Newton's method to refine the estimate of 1/sqrt(this).
+                est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
+    return vmulq_f32(fVec, est2);
+}
 
 M(Sk4i) equal    (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }
 M(Sk4i) notEqual (const Sk4f& o) const { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); }
```