1 files changed, 88 insertions, 150 deletions
diff --git a/third_party/skcms/skcms.cc b/third_party/skcms/skcms.cc
index 0a820629ad..3595088633 100644
--- a/third_party/skcms/skcms.cc
+++ b/third_party/skcms/skcms.cc
@@ -13,6 +13,12 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if defined(__ARM_NEON)
+    #include <arm_neon.h>
+#elif defined(__SSE__)
+    #include <immintrin.h>
+#endif
+
 // sizeof(x) will return size_t, which is 32-bit on some machines and 64-bit on others.
 // We have better testing on 64-bit machines, so force 32-bit machines to behave like 64-bit.
 //
@@ -1806,174 +1812,108 @@ typedef enum {
 #endif
 
 #if defined(__clang__)
-    typedef float    __attribute__((ext_vector_type(4)))   Fx4;
-    typedef int32_t  __attribute__((ext_vector_type(4))) I32x4;
-    typedef uint64_t __attribute__((ext_vector_type(4))) U64x4;
-    typedef uint32_t __attribute__((ext_vector_type(4))) U32x4;
-    typedef uint16_t __attribute__((ext_vector_type(4))) U16x4;
-    typedef uint8_t  __attribute__((ext_vector_type(4)))  U8x4;
-
-    typedef float    __attribute__((ext_vector_type(8)))   Fx8;
-    typedef int32_t  __attribute__((ext_vector_type(8))) I32x8;
-    typedef uint64_t __attribute__((ext_vector_type(8))) U64x8;
-    typedef uint32_t __attribute__((ext_vector_type(8))) U32x8;
-    typedef uint16_t __attribute__((ext_vector_type(8))) U16x8;
-    typedef uint8_t  __attribute__((ext_vector_type(8)))  U8x8;
-
-    typedef float    __attribute__((ext_vector_type(16)))   Fx16;
-    typedef int32_t  __attribute__((ext_vector_type(16))) I32x16;
-    typedef uint64_t __attribute__((ext_vector_type(16))) U64x16;
-    typedef uint32_t __attribute__((ext_vector_type(16))) U32x16;
-    typedef uint16_t __attribute__((ext_vector_type(16))) U16x16;
-    typedef uint8_t  __attribute__((ext_vector_type(16)))  U8x16;
+    template <int N, typename T> using Vec = T __attribute__((ext_vector_type(N)));  // N-element vector of T
 #elif defined(__GNUC__)
-    typedef float    __attribute__((vector_size(16)))   Fx4;
-    typedef int32_t  __attribute__((vector_size(16))) I32x4;
-    typedef uint64_t __attribute__((vector_size(32))) U64x4;
-    typedef uint32_t __attribute__((vector_size(16))) U32x4;
-    typedef uint16_t __attribute__((vector_size( 8))) U16x4;
-    typedef uint8_t  __attribute__((vector_size( 4)))  U8x4;
-
-    typedef float    __attribute__((vector_size(32)))   Fx8;
-    typedef int32_t  __attribute__((vector_size(32))) I32x8;
-    typedef uint64_t __attribute__((vector_size(64))) U64x8;
-    typedef uint32_t __attribute__((vector_size(32))) U32x8;
-    typedef uint16_t __attribute__((vector_size(16))) U16x8;
-    typedef uint8_t  __attribute__((vector_size( 8)))  U8x8;
-
-    typedef float    __attribute__((vector_size( 64)))   Fx16;
-    typedef int32_t  __attribute__((vector_size( 64))) I32x16;
-    typedef uint64_t __attribute__((vector_size(128))) U64x16;
-    typedef uint32_t __attribute__((vector_size( 64))) U32x16;
-    typedef uint16_t __attribute__((vector_size( 32))) U16x16;
-    typedef uint8_t  __attribute__((vector_size( 16)))  U8x16;
+    // Vec is routed through VecHelper's member typedef: GCC accepts that, but not the direct alias,
+    //   template <int N, typename T> using Vec = T __attribute__((vector_size(N*sizeof(T))));
+    template <int N, typename T>
+    struct VecHelper { typedef T __attribute__((vector_size(N*sizeof(T)))) V; };
+
+    template <int N, typename T> using Vec = typename VecHelper<N,T>::V;
 #endif
 
 // First, instantiate our default exec_ops() implementation using the default compiliation target.
 
+namespace baseline {  // Transform_inl.h instantiated for the default compilation target (ATTR is empty).
 #if defined(SKCMS_PORTABLE) || !(defined(__clang__) || defined(__GNUC__))
     #define N 1
-
-    #define F   float
-    #define U64 uint64_t
-    #define U32 uint32_t
-    #define I32 int32_t
-    #define U16 uint16_t
-    #define U8  uint8_t
-
-    #define F0 0.0f
-    #define F1 1.0f
+    using F   = float;  // N == 1: plain scalar types take the place of the Vec<N,T> aliases.
+    using U64 = uint64_t;
+    using U32 = uint32_t;
+    using I32 = int32_t;
+    using U16 = uint16_t;
+    using U8  = uint8_t;
 
 #elif defined(__AVX512F__)
     #define N 16
-
-    #define F     Fx16
-    #define U64 U64x16
-    #define U32 U32x16
-    #define I32 I32x16
-    #define U16 U16x16
-    #define U8   U8x16
-
-    #define F0 F{0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}
-    #define F1 F{1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1}
+    using   F = Vec<N,float>;  // 16-element vectors when __AVX512F__ is defined.
+    using I32 = Vec<N,int32_t>;
+    using U64 = Vec<N,uint64_t>;
+    using U32 = Vec<N,uint32_t>;
+    using U16 = Vec<N,uint16_t>;
+    using  U8 = Vec<N,uint8_t>;
 #elif defined(__AVX__)
     #define N 8
-
-    #define F     Fx8
-    #define U64 U64x8
-    #define U32 U32x8
-    #define I32 I32x8
-    #define U16 U16x8
-    #define U8   U8x8
-
-    #define F0 F{0,0,0,0, 0,0,0,0}
-    #define F1 F{1,1,1,1, 1,1,1,1}
+    using   F = Vec<N,float>;  // 8-element vectors when __AVX__ (but not __AVX512F__) is defined.
+    using I32 = Vec<N,int32_t>;
+    using U64 = Vec<N,uint64_t>;
+    using U32 = Vec<N,uint32_t>;
+    using U16 = Vec<N,uint16_t>;
+    using  U8 = Vec<N,uint8_t>;
 #else
     #define N 4
-
-    #define F     Fx4
-    #define U64 U64x4
-    #define U32 U32x4
-    #define I32 I32x4
-    #define U16 U16x4
-    #define U8   U8x4
-
-    #define F0 F{0,0,0,0}
-    #define F1 F{1,1,1,1}
+    using   F = Vec<N,float>;  // 4-element vectors for every other clang/GCC target.
+    using I32 = Vec<N,int32_t>;
+    using U64 = Vec<N,uint64_t>;
+    using U32 = Vec<N,uint32_t>;
+    using U16 = Vec<N,uint16_t>;
+    using  U8 = Vec<N,uint8_t>;
 #endif
 
-#define NS(id) id
-#define ATTR
+    #define ATTR
     #include "src/Transform_inl.h"
-#undef N
-#undef F
-#undef U64
-#undef U32
-#undef I32
-#undef U16
-#undef U8
-#undef F0
-#undef F1
-#undef NS
-#undef ATTR
+    #undef N
+    #undef ATTR
+}  // namespace baseline
 
 // Now, instantiate any other versions of run_program() we may want for runtime detection.
 #if !defined(SKCMS_PORTABLE) && (defined(__clang__) || defined(__GNUC__)) \
         && defined(__x86_64__) && !defined(__AVX2__)
-    #define N 8
-    #define F     Fx8
-    #define U64 U64x8
-    #define U32 U32x8
-    #define I32 I32x8
-    #define U16 U16x8
-    #define U8   U8x8
-    #define F0 F{0,0,0,0, 0,0,0,0}
-    #define F1 F{1,1,1,1, 1,1,1,1}
-
-    #define NS(id) id ## _hsw
-    #define ATTR __attribute__((target("avx2,f16c")))
-
-    // We check these guards to see if we have support for these features.
-    // They're likely _not_ defined here in our baseline build config.
-    #ifndef __AVX__
-        #define __AVX__ 1
-        #define UNDEF_AVX
-    #endif
-    #ifndef __F16C__
-        #define __F16C__ 1
-        #define UNDEF_F16C
-    #endif
-    #ifndef __AVX2__
-        #define __AVX2__ 1
-        #define UNDEF_AVX2
-    #endif
-
-    #include "src/Transform_inl.h"
-
-    #undef N
-    #undef F
-    #undef U64
-    #undef U32
-    #undef I32
-    #undef U16
-    #undef U8
-    #undef F0
-    #undef F1
-    #undef NS
-    #undef ATTR
 
-    #ifdef UNDEF_AVX
-        #undef __AVX__
-        #undef UNDEF_AVX
-    #endif
-    #ifdef UNDEF_F16C
-        #undef __F16C__
-        #undef UNDEF_F16C
-    #endif
-    #ifdef UNDEF_AVX2
-        #undef __AVX2__
-        #undef UNDEF_AVX2
-    #endif
+    namespace hsw {  // Second copy of Transform_inl.h, built with ATTR = target("avx2,f16c").
+        #define N 8
+        using   F = Vec<N,float>;
+        using I32 = Vec<N,int32_t>;
+        using U64 = Vec<N,uint64_t>;
+        using U32 = Vec<N,uint32_t>;
+        using U16 = Vec<N,uint16_t>;
+        using  U8 = Vec<N,uint8_t>;
+
+        #define ATTR __attribute__((target("avx2,f16c")))
+
+        // These feature macros are checked as guards (presumably inside Transform_inl.h) and are likely
+        // _not_ defined in the baseline build config, so define any missing ones around the include.
+        #ifndef __AVX__
+            #define __AVX__ 1
+            #define UNDEF_AVX
+        #endif
+        #ifndef __F16C__
+            #define __F16C__ 1
+            #define UNDEF_F16C
+        #endif
+        #ifndef __AVX2__
+            #define __AVX2__ 1
+            #define UNDEF_AVX2
+        #endif
+
+        #include "src/Transform_inl.h"
+
+        #undef N
+        #undef ATTR
+
+        #ifdef UNDEF_AVX  // Undo only the feature macros this block defined itself.
+            #undef __AVX__
+            #undef UNDEF_AVX
+        #endif
+        #ifdef UNDEF_F16C
+            #undef __F16C__
+            #undef UNDEF_F16C
+        #endif
+        #ifdef UNDEF_AVX2
+            #undef __AVX2__
+            #undef UNDEF_AVX2
+        #endif
+    }  // namespace hsw
 
     #define TEST_FOR_HSW
 
@@ -2319,11 +2259,9 @@ bool skcms_Transform(const void*             src,
         case skcms_PixelFormat_RGBA_ffff     >> 1: *ops++ = Op_store_ffff;     break;
     }
 
-    void (*run)(const Op*, const void**, const char*, char*, int, size_t,size_t) = run_program;
+    auto run = baseline::run_program;
 #if defined(TEST_FOR_HSW)
-    if (hsw_ok()) {
-        run = run_program_hsw;
-    }
+    if (hsw_ok()) { run = hsw::run_program; }
 #endif
     run(program, arguments, (const char*)src, (char*)dst, n, src_bpp,dst_bpp);
     return true;