From f0348c2413c5c72820a42749879d41c6dd4ab16c Mon Sep 17 00:00:00 2001
From: Mike Klein <mtklein@chromium.org>
Date: Thu, 3 Nov 2016 14:43:48 -0400
Subject: Implement SkNx_fma() for Sk4f on ARMv8.

I was looking at the disassembly of matrix_4x5() and noticed it didn't have any FMAs.  This makes things that call SkNx_fma() actually use the FMA instruction.

BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=4400
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Change-Id: Ia353a77b0ca14385a43b564997b05586f9472996
Reviewed-on: https://skia-review.googlesource.com/4400
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
---
 src/opts/SkNx_neon.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src/opts')

diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index b5d89891d1..c85d583ea2 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -218,6 +218,12 @@ public:
     float32x4_t fVec;
 };
 
+#if defined(SK_CPU_ARM64)  // this overload is compiled only when SK_CPU_ARM64 is defined
+    AI static Sk4f SkNx_fma(const Sk4f& f, const Sk4f& m, const Sk4f& a) {  // multiply-add of f, m, a via one NEON FMA intrinsic
+        return vfmaq_f32(a.fVec, f.fVec, m.fVec);  // the addend a goes first: per lane, a + f*m
+    }
+#endif  // defined(SK_CPU_ARM64)
+
 // It's possible that for our current use cases, representing this as
 // half a uint16x8_t might be better than representing it as a uint16x4_t.
 // It'd make conversion to Sk4b one step simpler.
-- 
cgit v1.2.3