From 106e17aa38c9cf90c864232ae8dc7667f37bb787 Mon Sep 17 00:00:00 2001
From: Mike Klein <mtklein@chromium.org>
Date: Tue, 12 Dec 2017 17:07:49 -0500
Subject: JUMPER_IS_AVX2 -> JUMPER_IS_HSW

We need to be a bit more pedantic here to support builds that may be
using AVX2 as part of their baseline but perhaps not enabling all the
related features SkJumper would like to use.

E.g. we've seen TensorFlow built with AVX2 and FMA enabled, but not F16C.

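So check for all three of {AVX2, FMA, F16C}, and only then build stages in
HSW (Haswell) mode.  I've renamed the define from JUMPER_IS_AVX2 to
JUMPER_IS_HSW as a reminder that it implies all three features, not just AVX2.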

This only affects builds using these features for their _baseline_
stages; the offline-compiled stages in SkJumper_generated.S are
not affected.

Change-Id: I9bfb3bae3589d35043b748782cefa8c213726d6a
Reviewed-on: https://skia-review.googlesource.com/84221
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
---
 src/jumper/SkJumper_stages.cpp |  4 ++--
 src/jumper/SkJumper_vectors.h  | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'src/jumper')

diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index e7ffe6bbff..8b2e01d81b 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -624,7 +624,7 @@ STAGE(to_srgb, Ctx::None) {
         const float c = 1.130026340485f,
                     d = 0.141387879848f;
     #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \
-          defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_AVX2 )
+          defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_HSW )
         const float c = 1.130048394203f,
                     d = 0.141357362270f;
     #elif defined(JUMPER_IS_NEON)
@@ -1162,7 +1162,7 @@ STAGE(matrix_perspective, const float* m) {
 SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                         F* r, F* g, F* b, F* a) {
     F fr, br, fg, bg, fb, bb, fa, ba;
-#if defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
+#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
     if (c->stopCount <=8) {
         fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
         br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index 5bae17cba8..0557d28199 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -22,8 +22,8 @@
     #define JUMPER_IS_NEON
 #elif defined(__AVX512F__)
     #define JUMPER_IS_AVX512
-#elif defined(__AVX2__)
-    #define JUMPER_IS_AVX2
+#elif defined(__AVX2__) && defined(__F16C__) && defined(__FMA__)
+    #define JUMPER_IS_HSW
 #elif defined(__AVX__)
     #define JUMPER_IS_AVX
 #elif defined(__SSE4_1__)
@@ -216,7 +216,7 @@
         }
     }
 
-#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
+#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
     #include <immintrin.h>
 
     // These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -229,7 +229,7 @@
     using U8  = V<uint8_t >;
 
     SI F mad(F f, F m, F a)  {
-    #if defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
+    #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
         return _mm256_fmadd_ps(f,m,a);
     #else
         return f*m+a;
@@ -261,7 +261,7 @@
         return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                  p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
     }
-    #if defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
+    #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
         SI F   gather(const float*    p, U32 ix) { return _mm256_i32gather_ps   (p, ix, 4); }
         SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
         SI U64 gather(const uint64_t* p, U32 ix) {
@@ -658,7 +658,7 @@ SI F from_half(U16 h) {
 #if defined(__aarch64__)
     return vcvt_f32_f16(h);
 
-#elif defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
+#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
     return _mm256_cvtph_ps(h);
 
 #else
@@ -678,7 +678,7 @@ SI U16 to_half(F f) {
 #if defined(__aarch64__)
     return vcvt_f16_f32(f);
 
-#elif defined(JUMPER_IS_AVX2) || defined(JUMPER_IS_AVX512)
+#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
     return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
 
 #else
-- 
cgit v1.2.3