author     skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>  2018-07-30 13:00:30 +0000
committer  Skia Commit-Bot <skia-commit-bot@chromium.org>  2018-07-30 13:35:45 +0000
commit     4241bab68aa2206a7be63d76411d1f9a39aa4dab (patch)
tree       83404f2e3598b5e165e5a7d601ba59144069000f
parent     c06b8a4e0ca3844afbbbff545730f96d31fab897 (diff)
Roll skia/third_party/skcms 23e7777f421d..51fba282d9a0 (1 commits)
https://skia.googlesource.com/skcms.git/+log/23e7777f421d..51fba282d9a0

2018-07-30 mtklein@google.com start cleaning up src/Transform_inl.h with C++

The AutoRoll server is located here: https://skcms-skia-roll.skia.org

Documentation for the AutoRoller is here:
https://skia.googlesource.com/buildbot/+/master/autoroll/README.md

If the roll is causing failures, please contact the current sheriff, who should
be CC'd on the roll, and stop the roller if necessary.

CQ_INCLUDE_TRYBOTS=master.tryserver.blink:linux_trusty_blink_rel
TBR=scroggo@google.com
Change-Id: I3005cfe39088dae5564a8ddd9fffd6eefd122065
Reviewed-on: https://skia-review.googlesource.com/144296
Reviewed-by: skcms-skia-autoroll <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>
Commit-Queue: skcms-skia-autoroll <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>
-rw-r--r--  third_party/skcms/skcms.cc             | 238
-rw-r--r--  third_party/skcms/src/Transform_inl.h  | 196
-rwxr-xr-x  third_party/skcms/version.sha1         |   2
3 files changed, 174 insertions(+), 262 deletions(-)
diff --git a/third_party/skcms/skcms.cc b/third_party/skcms/skcms.cc
index 0a820629ad..3595088633 100644
--- a/third_party/skcms/skcms.cc
+++ b/third_party/skcms/skcms.cc
@@ -13,6 +13,12 @@
#include <stdlib.h>
#include <string.h>
+#if defined(__ARM_NEON)
+ #include <arm_neon.h>
+#elif defined(__SSE__)
+ #include <immintrin.h>
+#endif
+
// sizeof(x) will return size_t, which is 32-bit on some machines and 64-bit on others.
// We have better testing on 64-bit machines, so force 32-bit machines to behave like 64-bit.
//
@@ -1806,174 +1812,108 @@ typedef enum {
#endif
#if defined(__clang__)
- typedef float __attribute__((ext_vector_type(4))) Fx4;
- typedef int32_t __attribute__((ext_vector_type(4))) I32x4;
- typedef uint64_t __attribute__((ext_vector_type(4))) U64x4;
- typedef uint32_t __attribute__((ext_vector_type(4))) U32x4;
- typedef uint16_t __attribute__((ext_vector_type(4))) U16x4;
- typedef uint8_t __attribute__((ext_vector_type(4))) U8x4;
-
- typedef float __attribute__((ext_vector_type(8))) Fx8;
- typedef int32_t __attribute__((ext_vector_type(8))) I32x8;
- typedef uint64_t __attribute__((ext_vector_type(8))) U64x8;
- typedef uint32_t __attribute__((ext_vector_type(8))) U32x8;
- typedef uint16_t __attribute__((ext_vector_type(8))) U16x8;
- typedef uint8_t __attribute__((ext_vector_type(8))) U8x8;
-
- typedef float __attribute__((ext_vector_type(16))) Fx16;
- typedef int32_t __attribute__((ext_vector_type(16))) I32x16;
- typedef uint64_t __attribute__((ext_vector_type(16))) U64x16;
- typedef uint32_t __attribute__((ext_vector_type(16))) U32x16;
- typedef uint16_t __attribute__((ext_vector_type(16))) U16x16;
- typedef uint8_t __attribute__((ext_vector_type(16))) U8x16;
+ template <int N, typename T> using Vec = T __attribute__((ext_vector_type(N)));
#elif defined(__GNUC__)
- typedef float __attribute__((vector_size(16))) Fx4;
- typedef int32_t __attribute__((vector_size(16))) I32x4;
- typedef uint64_t __attribute__((vector_size(32))) U64x4;
- typedef uint32_t __attribute__((vector_size(16))) U32x4;
- typedef uint16_t __attribute__((vector_size( 8))) U16x4;
- typedef uint8_t __attribute__((vector_size( 4))) U8x4;
-
- typedef float __attribute__((vector_size(32))) Fx8;
- typedef int32_t __attribute__((vector_size(32))) I32x8;
- typedef uint64_t __attribute__((vector_size(64))) U64x8;
- typedef uint32_t __attribute__((vector_size(32))) U32x8;
- typedef uint16_t __attribute__((vector_size(16))) U16x8;
- typedef uint8_t __attribute__((vector_size( 8))) U8x8;
-
- typedef float __attribute__((vector_size( 64))) Fx16;
- typedef int32_t __attribute__((vector_size( 64))) I32x16;
- typedef uint64_t __attribute__((vector_size(128))) U64x16;
- typedef uint32_t __attribute__((vector_size( 64))) U32x16;
- typedef uint16_t __attribute__((vector_size( 32))) U16x16;
- typedef uint8_t __attribute__((vector_size( 16))) U8x16;
+ // For some reason GCC accepts this nonsense, but not the more straightforward version,
+ // template <int N, typename T> using Vec = T __attribute__((vector_size(N*sizeof(T))));
+ template <int N, typename T>
+ struct VecHelper { typedef T __attribute__((vector_size(N*sizeof(T)))) V; };
+
+ template <int N, typename T> using Vec = typename VecHelper<N,T>::V;
#endif
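The three per-width typedef blocks collapse into a single Vec<N,T> alias template: clang accepts the attribute directly on the alias, while GCC (per the new comment) needs it routed through a helper struct. A standalone sketch of how the alias expands and behaves under either compiler, separate from the patch itself:

```cpp
#include <cstdint>

#if defined(__clang__)
    template <int N, typename T> using Vec = T __attribute__((ext_vector_type(N)));
#elif defined(__GNUC__)
    // GCC rejects the attribute directly on the alias, so route it through a helper struct.
    template <int N, typename T>
    struct VecHelper { typedef T __attribute__((vector_size(N * sizeof(T)))) V; };
    template <int N, typename T> using Vec = typename VecHelper<N, T>::V;
#endif

int main() {
    Vec<4, float>   f = {1, 2, 3, 4};   // 16-byte vector of 4 floats
    Vec<4, int32_t> i = {4, 3, 2, 1};   // element-wise operations work on these directly
    f = f + 1.0f;                       // the scalar is broadcast across all 4 lanes
    return (int)f[0] + (int)i[3];       // 2 + 1 = 3
}
```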
// First, instantiate our default exec_ops() implementation using the default compilation target.
+namespace baseline {
#if defined(SKCMS_PORTABLE) || !(defined(__clang__) || defined(__GNUC__))
#define N 1
-
- #define F float
- #define U64 uint64_t
- #define U32 uint32_t
- #define I32 int32_t
- #define U16 uint16_t
- #define U8 uint8_t
-
- #define F0 0.0f
- #define F1 1.0f
+ using F = float;
+ using U64 = uint64_t;
+ using U32 = uint32_t;
+ using I32 = int32_t;
+ using U16 = uint16_t;
+ using U8 = uint8_t;
#elif defined(__AVX512F__)
#define N 16
-
- #define F Fx16
- #define U64 U64x16
- #define U32 U32x16
- #define I32 I32x16
- #define U16 U16x16
- #define U8 U8x16
-
- #define F0 F{0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}
- #define F1 F{1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1}
+ using F = Vec<N,float>;
+ using I32 = Vec<N,int32_t>;
+ using U64 = Vec<N,uint64_t>;
+ using U32 = Vec<N,uint32_t>;
+ using U16 = Vec<N,uint16_t>;
+ using U8 = Vec<N,uint8_t>;
#elif defined(__AVX__)
#define N 8
-
- #define F Fx8
- #define U64 U64x8
- #define U32 U32x8
- #define I32 I32x8
- #define U16 U16x8
- #define U8 U8x8
-
- #define F0 F{0,0,0,0, 0,0,0,0}
- #define F1 F{1,1,1,1, 1,1,1,1}
+ using F = Vec<N,float>;
+ using I32 = Vec<N,int32_t>;
+ using U64 = Vec<N,uint64_t>;
+ using U32 = Vec<N,uint32_t>;
+ using U16 = Vec<N,uint16_t>;
+ using U8 = Vec<N,uint8_t>;
#else
#define N 4
-
- #define F Fx4
- #define U64 U64x4
- #define U32 U32x4
- #define I32 I32x4
- #define U16 U16x4
- #define U8 U8x4
-
- #define F0 F{0,0,0,0}
- #define F1 F{1,1,1,1}
+ using F = Vec<N,float>;
+ using I32 = Vec<N,int32_t>;
+ using U64 = Vec<N,uint64_t>;
+ using U32 = Vec<N,uint32_t>;
+ using U16 = Vec<N,uint16_t>;
+ using U8 = Vec<N,uint8_t>;
#endif
-#define NS(id) id
-#define ATTR
+ #define ATTR
#include "src/Transform_inl.h"
-#undef N
-#undef F
-#undef U64
-#undef U32
-#undef I32
-#undef U16
-#undef U8
-#undef F0
-#undef F1
-#undef NS
-#undef ATTR
+ #undef N
+ #undef ATTR
+}
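Where the old code manufactured unique symbol names with the NS(id) macro, the rewrite #includes src/Transform_inl.h inside a namespace, so every function the header defines comes out as baseline::to_fixed, baseline::exec_ops, baseline::run_program, and so on, with N and ATTR set up beforehand. A self-contained sketch of the pattern, using a small macro as a stand-in for the shared header body:

```cpp
#include <cstdio>

// Stand-in for the body of src/Transform_inl.h: it defines plain, unprefixed names
// using whatever N and ATTR mean at the point of inclusion.
#define TRANSFORM_INL_STAND_IN()                      \
    static int n_lanes() { return N; }                \
    ATTR static void exec(float* p) { p[0] *= 2.0f; }

namespace baseline {
    #define N 1
    #define ATTR
    TRANSFORM_INL_STAND_IN()   // the real code does: #include "src/Transform_inl.h"
    #undef N
    #undef ATTR
}

int main() {
    float x[1] = {21.0f};
    baseline::exec(x);                                  // namespace-qualified, no NS() macro needed
    std::printf("%d %g\n", baseline::n_lanes(), x[0]);  // 1 42
}
```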
// Now, instantiate any other versions of run_program() we may want for runtime detection.
#if !defined(SKCMS_PORTABLE) && (defined(__clang__) || defined(__GNUC__)) \
&& defined(__x86_64__) && !defined(__AVX2__)
- #define N 8
- #define F Fx8
- #define U64 U64x8
- #define U32 U32x8
- #define I32 I32x8
- #define U16 U16x8
- #define U8 U8x8
- #define F0 F{0,0,0,0, 0,0,0,0}
- #define F1 F{1,1,1,1, 1,1,1,1}
-
- #define NS(id) id ## _hsw
- #define ATTR __attribute__((target("avx2,f16c")))
-
- // We check these guards to see if we have support for these features.
- // They're likely _not_ defined here in our baseline build config.
- #ifndef __AVX__
- #define __AVX__ 1
- #define UNDEF_AVX
- #endif
- #ifndef __F16C__
- #define __F16C__ 1
- #define UNDEF_F16C
- #endif
- #ifndef __AVX2__
- #define __AVX2__ 1
- #define UNDEF_AVX2
- #endif
-
- #include "src/Transform_inl.h"
-
- #undef N
- #undef F
- #undef U64
- #undef U32
- #undef I32
- #undef U16
- #undef U8
- #undef F0
- #undef F1
- #undef NS
- #undef ATTR
- #ifdef UNDEF_AVX
- #undef __AVX__
- #undef UNDEF_AVX
- #endif
- #ifdef UNDEF_F16C
- #undef __F16C__
- #undef UNDEF_F16C
- #endif
- #ifdef UNDEF_AVX2
- #undef __AVX2__
- #undef UNDEF_AVX2
- #endif
+ namespace hsw {
+ #define N 8
+ using F = Vec<N,float>;
+ using I32 = Vec<N,int32_t>;
+ using U64 = Vec<N,uint64_t>;
+ using U32 = Vec<N,uint32_t>;
+ using U16 = Vec<N,uint16_t>;
+ using U8 = Vec<N,uint8_t>;
+
+ #define ATTR __attribute__((target("avx2,f16c")))
+
+ // We check these guards to see if we have support for these features.
+ // They're likely _not_ defined here in our baseline build config.
+ #ifndef __AVX__
+ #define __AVX__ 1
+ #define UNDEF_AVX
+ #endif
+ #ifndef __F16C__
+ #define __F16C__ 1
+ #define UNDEF_F16C
+ #endif
+ #ifndef __AVX2__
+ #define __AVX2__ 1
+ #define UNDEF_AVX2
+ #endif
+
+ #include "src/Transform_inl.h"
+
+ #undef N
+ #undef ATTR
+
+ #ifdef UNDEF_AVX
+ #undef __AVX__
+ #undef UNDEF_AVX
+ #endif
+ #ifdef UNDEF_F16C
+ #undef __F16C__
+ #undef UNDEF_F16C
+ #endif
+ #ifdef UNDEF_AVX2
+ #undef __AVX2__
+ #undef UNDEF_AVX2
+ #endif
+ }
#define TEST_FOR_HSW
@@ -2319,11 +2259,9 @@ bool skcms_Transform(const void* src,
case skcms_PixelFormat_RGBA_ffff >> 1: *ops++ = Op_store_ffff; break;
}
- void (*run)(const Op*, const void**, const char*, char*, int, size_t,size_t) = run_program;
+ auto run = baseline::run_program;
#if defined(TEST_FOR_HSW)
- if (hsw_ok()) {
- run = run_program_hsw;
- }
+ if (hsw_ok()) { run = hsw::run_program; }
#endif
run(program, arguments, (const char*)src, (char*)dst, n, src_bpp,dst_bpp);
return true;
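With both instantiations living in namespaces, the dispatch in skcms_Transform() is just a function-pointer choice between baseline::run_program and hsw::run_program. A self-contained sketch of the same shape; the CPU check here uses a compiler builtin purely for illustration, since the real hsw_ok() is defined elsewhere in skcms.cc and is not part of this diff:

```cpp
#include <cstdio>

namespace baseline {
    static void scale(float* p, int n) { for (int i = 0; i < n; i++) p[i] *= 2.0f; }
}

namespace hsw {
    // The target attribute lets this copy use AVX2/F16C encodings even though the file
    // is built with baseline flags; it must only be called after a runtime check.
    __attribute__((target("avx2,f16c")))
    static void scale(float* p, int n) { for (int i = 0; i < n; i++) p[i] *= 2.0f; }
}

int main() {
    float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    auto run = baseline::scale;                  // safe default, as in skcms_Transform()
    if (__builtin_cpu_supports("avx2")) {        // x86 GCC/clang builtin; stands in for hsw_ok()
        run = hsw::scale;
    }
    run(v, 8);
    std::printf("%g\n", v[7]);                   // 16
}
```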
diff --git a/third_party/skcms/src/Transform_inl.h b/third_party/skcms/src/Transform_inl.h
index 09183bfd42..4d09fed750 100644
--- a/third_party/skcms/src/Transform_inl.h
+++ b/third_party/skcms/src/Transform_inl.h
@@ -7,30 +7,28 @@
// Intentionally NO #pragma once... included multiple times.
-// This file is included from skcms.c with some values and types pre-defined:
+// This file is included from skcms.cc with some pre-defined macros:
// N: depth of all vectors, 1,4,8, or 16
-//
+// ATTR: an __attribute__ to apply to functions
+// and inside a namespace, with some types already defined:
// F: a vector of N float
// I32: a vector of N int32_t
// U64: a vector of N uint64_t
// U32: a vector of N uint32_t
// U16: a vector of N uint16_t
// U8: a vector of N uint8_t
-//
-// F0: a vector of N floats set to zero
-// F1: a vector of N floats set to one
-//
-// NS(id): a macro that returns unique identifiers
-// ATTR: an __attribute__ to apply to functions
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
// TODO(mtklein): this build supports FP16 compute
#endif
-#if defined(__ARM_NEON)
- #include <arm_neon.h>
-#elif defined(__SSE__)
- #include <immintrin.h>
+#if defined(__GNUC__) && !defined(__clang__)
+ // Once again, GCC is kind of weird, not allowing vector = scalar directly.
+ static constexpr F F0 = F() + 0.0f,
+ F1 = F() + 1.0f;
+#else
+ static constexpr F F0 = 0.0f,
+ F1 = 1.0f;
#endif
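F0 and F1 move from per-width initializer-list #defines into constexpr vectors, and the GCC branch spells them F() + 0.0f because, as the comment notes, GCC will not initialize a vector from a bare scalar, while vector-plus-scalar arithmetic broadcasts fine. A standalone sketch of that distinction using the plain vector_size extension (the patch marks its copies constexpr):

```cpp
typedef float __attribute__((vector_size(16))) F4;   // 4 floats

// F4 bad = 1.0f;                       // GCC rejects scalar -> vector initialization
static const F4 F0 = F4() + 0.0f,       // value-initialize, then broadcast with +
                F1 = F4() + 1.0f;       // every lane holds 1.0f

int main() { return (int)(F1[0] + F1[3]); }   // 2
```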
#if N == 4 && defined(__ARM_NEON)
@@ -83,8 +81,7 @@
// When we convert from float to fixed point, it's very common to want to round,
// and for some reason compilers generate better code when converting to int32_t.
// To serve both those ends, we use this function to_fixed() instead of direct CASTs.
-SI ATTR I32 NS(to_fixed_)(F f) { return CAST(I32, f + 0.5f); }
-#define to_fixed NS(to_fixed_)
+SI ATTR I32 to_fixed(F f) { return CAST(I32, f + 0.5f); }
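to_fixed() keeps its behavior under the new naming scheme: adding 0.5f before the truncating cast rounds non-negative values to nearest, and the cast targets int32_t because, per the comment, compilers generate better code for signed conversions. A scalar illustration:

```cpp
#include <cstdint>
#include <cstdio>

static int32_t to_fixed(float f) { return (int32_t)(f + 0.5f); }   // round-half-up, for f >= 0

int main() {
    std::printf("%d %d %d\n", to_fixed(2.3f), to_fixed(2.5f), to_fixed(2.7f));   // 2 3 3
}
```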
// Comparisons result in bool when N == 1, in an I32 mask when N > 1.
// We've made this a macro so it can be type-generic...
@@ -96,23 +93,23 @@ SI ATTR I32 NS(to_fixed_)(F f) { return CAST(I32, f + 0.5f); }
#endif
#if defined(USING_NEON_F16C)
- SI ATTR F NS(F_from_Half_(U16 half)) { return vcvt_f32_f16((float16x4_t)half); }
- SI ATTR U16 NS(Half_from_F_(F f)) { return (U16)vcvt_f16_f32( f); }
+ SI ATTR F F_from_Half(U16 half) { return vcvt_f32_f16((float16x4_t)half); }
+ SI ATTR U16 Half_from_F(F f) { return (U16)vcvt_f16_f32( f); }
#elif defined(__AVX512F__)
- SI ATTR F NS(F_from_Half_)(U16 half) { return (F)_mm512_cvtph_ps((__m256i)half); }
- SI ATTR U16 NS(Half_from_F_)(F f) {
+ SI ATTR F F_from_Half(U16 half) { return (F)_mm512_cvtph_ps((__m256i)half); }
+ SI ATTR U16 Half_from_F(F f) {
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
}
#elif defined(USING_AVX_F16C)
- SI ATTR F NS(F_from_Half_)(U16 half) {
+ SI ATTR F F_from_Half(U16 half) {
typedef int16_t __attribute__((vector_size(16))) I16;
return __builtin_ia32_vcvtph2ps256((I16)half);
}
- SI ATTR U16 NS(Half_from_F_)(F f) {
+ SI ATTR U16 Half_from_F(F f) {
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
}
#else
- SI ATTR F NS(F_from_Half_)(U16 half) {
+ SI ATTR F F_from_Half(U16 half) {
U32 wide = CAST(U32, half);
// A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
U32 s = wide & 0x8000,
@@ -127,7 +124,7 @@ SI ATTR I32 NS(to_fixed_)(F f) { return CAST(I32, f + 0.5f); }
return (F)if_then_else(em < 0x0400, F0, norm);
}
- SI ATTR U16 NS(Half_from_F_)(F f) {
+ SI ATTR U16 Half_from_F(F f) {
// A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
U32 sem;
small_memcpy(&sem, &f, sizeof(sem));
@@ -141,36 +138,28 @@ SI ATTR I32 NS(to_fixed_)(F f) { return CAST(I32, f + 0.5f); }
}
#endif
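The fallback F_from_Half above builds the float by hand: split off the sign, re-bias the 5-bit exponent from 15 to 127, shift the fields into float positions, and flush denormal halves (em < 0x0400) to zero. A scalar sketch of that construction; the vector code does the same thing lane-wise:

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

static float half_to_float(uint16_t half) {
    uint32_t wide = half;
    uint32_t s  = wide & 0x8000,                  // sign bit
             em = wide ^ s;                       // 5-bit exponent + 10-bit mantissa
    if (em < 0x0400) return 0.0f;                 // denormal halves flush to zero

    uint32_t bits = (s << 16)                     // sign into float bit 31
                  + (em << 13)                    // exponent + mantissa into float positions
                  + ((127 - 15) << 23);           // re-bias the exponent (15 -> 127)
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main() {
    std::printf("%g %g %g\n",
                half_to_float(0x3C00),            // 1
                half_to_float(0xC000),            // -2
                half_to_float(0x3555));           // ~0.333
}
```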
-#define F_from_Half NS(F_from_Half_)
-#define Half_from_F NS(Half_from_F_)
-
// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
#if defined(USING_NEON)
- SI ATTR U16 NS(swap_endian_16_)(U16 v) {
+ SI ATTR U16 swap_endian_16(U16 v) {
return (U16)vrev16_u8((uint8x8_t) v);
}
- #define swap_endian_16 NS(swap_endian_16_)
#endif
// Passing by U64* instead of U64 avoids ABI warnings. It's all moot when inlined.
-SI ATTR void NS(swap_endian_16x4_)(U64* rgba) {
+SI ATTR void swap_endian_16x4(U64* rgba) {
*rgba = (*rgba & 0x00ff00ff00ff00ff) << 8
| (*rgba & 0xff00ff00ff00ff00) >> 8;
}
-#define swap_endian_16x4 NS(swap_endian_16x4_)
#if defined(USING_NEON)
- SI ATTR F NS(min__)(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
- SI ATTR F NS(max__)(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
+ SI ATTR F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
+ SI ATTR F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
#else
- SI ATTR F NS(min__)(F x, F y) { return (F)if_then_else(x > y, y, x); }
- SI ATTR F NS(max__)(F x, F y) { return (F)if_then_else(x < y, y, x); }
+ SI ATTR F min_(F x, F y) { return (F)if_then_else(x > y, y, x); }
+ SI ATTR F max_(F x, F y) { return (F)if_then_else(x < y, y, x); }
#endif
-#define min_ NS(min__)
-#define max_ NS(max__)
-
-SI ATTR F NS(floor__)(F x) {
+SI ATTR F floor_(F x) {
#if N == 1
return floorf_(x);
#elif defined(__aarch64__)
@@ -191,9 +180,8 @@ SI ATTR F NS(floor__)(F x) {
// the range an integer can represent. We expect most x to be small.
#endif
}
-#define floor_ NS(floor__)
-SI ATTR F NS(approx_log2_)(F x) {
+SI ATTR F approx_log2(F x) {
// The first approximation of log2(x) is its exponent 'e', minus 127.
I32 bits;
small_memcpy(&bits, &x, sizeof(bits));
@@ -209,9 +197,8 @@ SI ATTR F NS(approx_log2_)(F x) {
- 1.498030302f*m
- 1.725879990f/(0.3520887068f + m);
}
-#define approx_log2 NS(approx_log2_)
-SI ATTR F NS(approx_exp2_)(F x) {
+SI ATTR F approx_exp2(F x) {
F fract = x - floor_(x);
I32 bits = CAST(I32, (1.0f * (1<<23)) * (x + 121.274057500f
@@ -220,16 +207,14 @@ SI ATTR F NS(approx_exp2_)(F x) {
small_memcpy(&x, &bits, sizeof(x));
return x;
}
-#define approx_exp2 NS(approx_exp2_)
-SI ATTR F NS(approx_pow_)(F x, float y) {
+SI ATTR F approx_pow(F x, float y) {
return (F)if_then_else((x == F0) | (x == F1), x
, approx_exp2(approx_log2(x) * y));
}
-#define approx_pow NS(approx_pow_)
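approx_log2() and approx_exp2() lean on the float bit layout: a float's bits, read as an integer and scaled by 2^-23, are approximately log2(x) + 127, and the rational terms visible in the hunk above refine the mantissa's contribution. A scalar sketch of just that first-order idea, without skcms's refinement constants:

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

static float rough_log2(float x) {
    int32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return (float)bits * (1.0f / (1 << 23)) - 127.0f;    // exponent estimate; exact for powers of two
}

static float rough_exp2(float x) {
    int32_t bits = (int32_t)((x + 127.0f) * (1 << 23));  // place x back into the exponent field
    float r;
    std::memcpy(&r, &bits, sizeof(r));
    return r;
}

int main() {
    std::printf("%g %g\n", rough_log2(8.0f), rough_exp2(3.0f));   // 3 8
    std::printf("%g\n", rough_log2(10.0f));                       // 3.25 vs. true 3.32, before refinement
}
```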
// Return tf(x).
-SI ATTR F NS(apply_tf_)(const skcms_TransferFunction* tf, F x) {
+SI ATTR F apply_tf(const skcms_TransferFunction* tf, F x) {
F sign = (F)if_then_else(x < 0, -F1, F1);
x *= sign;
@@ -238,7 +223,6 @@ SI ATTR F NS(apply_tf_)(const skcms_TransferFunction* tf, F x) {
return sign * (F)if_then_else(x < tf->d, linear, nonlinear);
}
-#define apply_tf NS(apply_tf_)
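apply_tf() evaluates skcms's seven-parameter transfer function, mirrored around zero: for |x| < d the linear piece c*x + f applies, otherwise the power piece (a*x + b)^g + e. A scalar sketch with a local stand-in struct whose field names mirror skcms_TransferFunction, exercised with the sRGB decoding curve written in that form:

```cpp
#include <cmath>
#include <cstdio>

struct TF { float g, a, b, c, d, e, f; };   // stand-in mirroring skcms_TransferFunction's fields

static float apply_tf(const TF& tf, float x) {
    float sign = x < 0 ? -1.0f : 1.0f;
    x *= sign;
    float linear    = tf.c * x + tf.f;
    float nonlinear = std::pow(tf.a * x + tf.b, tf.g) + tf.e;
    return sign * (x < tf.d ? linear : nonlinear);
}

int main() {
    // The sRGB curve (encoded value -> linear) expressed in this parametric form.
    TF srgb = {2.4f, 1/1.055f, 0.055f/1.055f, 1/12.92f, 0.04045f, 0.0f, 0.0f};
    std::printf("%.4f\n", apply_tf(srgb, 0.5f));   // ~0.2140, the linear value of sRGB 0.5
}
```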
// Strided loads and stores of N values, starting from p.
#if N == 1
@@ -283,7 +267,7 @@ SI ATTR F NS(apply_tf_)(const skcms_TransferFunction* tf, F x) {
(p)[48] = (v)[12]; (p)[52] = (v)[13]; (p)[56] = (v)[14]; (p)[60] = (v)[15]
#endif
-SI ATTR U8 NS(gather_8_)(const uint8_t* p, I32 ix) {
+SI ATTR U8 gather_8(const uint8_t* p, I32 ix) {
#if N == 1
U8 v = p[ix];
#elif N == 4
@@ -299,17 +283,15 @@ SI ATTR U8 NS(gather_8_)(const uint8_t* p, I32 ix) {
#endif
return v;
}
-#define gather_8 NS(gather_8_)
// Helper for gather_16(), loading the ix'th 16-bit value from p.
-SI ATTR uint16_t NS(load_16_)(const uint8_t* p, int ix) {
+SI ATTR uint16_t load_16(const uint8_t* p, int ix) {
uint16_t v;
small_memcpy(&v, p + 2*ix, 2);
return v;
}
-#define load_16 NS(load_16_)
-SI ATTR U16 NS(gather_16_)(const uint8_t* p, I32 ix) {
+SI ATTR U16 gather_16(const uint8_t* p, I32 ix) {
#if N == 1
U16 v = load_16(p,ix);
#elif N == 4
@@ -325,25 +307,22 @@ SI ATTR U16 NS(gather_16_)(const uint8_t* p, I32 ix) {
#endif
return v;
}
-#define gather_16 NS(gather_16_)
#if !defined(__AVX2__)
// Helpers for gather_24/48(), loading the ix'th 24/48-bit value from p, and 1/2 extra bytes.
- SI ATTR uint32_t NS(load_24_32_)(const uint8_t* p, int ix) {
+ SI ATTR uint32_t load_24_32(const uint8_t* p, int ix) {
uint32_t v;
small_memcpy(&v, p + 3*ix, 4);
return v;
}
- SI ATTR uint64_t NS(load_48_64_)(const uint8_t* p, int ix) {
+ SI ATTR uint64_t load_48_64(const uint8_t* p, int ix) {
uint64_t v;
small_memcpy(&v, p + 6*ix, 8);
return v;
}
- #define load_24_32 NS(load_24_32_)
- #define load_48_64 NS(load_48_64_)
#endif
-SI ATTR U32 NS(gather_24_)(const uint8_t* p, I32 ix) {
+SI ATTR U32 gather_24(const uint8_t* p, I32 ix) {
// First, back up a byte. Any place we're gathering from has a safe junk byte to read
// in front of it, either a previous table value, or some tag metadata.
p -= 1;
@@ -379,10 +358,9 @@ SI ATTR U32 NS(gather_24_)(const uint8_t* p, I32 ix) {
// Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
return v >> 8;
}
-#define gather_24 NS(gather_24_)
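gather_24() fetches each 24-bit table entry with a single unaligned 32-bit load that starts one byte early, then shifts the junk byte off; this is safe because, as the comment says, gathered tables are always preceded by at least one readable byte of profile data. The scalar essence of the trick:

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

static uint32_t gather_24(const uint8_t* p, int ix) {
    p -= 1;                                  // back up one byte into known-readable data
    uint32_t v;
    std::memcpy(&v, p + 3 * ix, 4);          // one 4-byte load: junk,b0,b1,b2 (little-endian)
    return v >> 8;                           // drop the junk byte; b0..b2 land in the low 24 bits
}

int main() {
    // One tag byte followed by two packed 3-byte entries.
    const uint8_t table[] = {0xEE, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66};
    std::printf("%06x %06x\n",
                (unsigned)gather_24(table + 1, 0),    // 332211
                (unsigned)gather_24(table + 1, 1));   // 665544
}
```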
#if !defined(__arm__)
- SI ATTR void NS(gather_48_)(const uint8_t* p, I32 ix, U64* v) {
+ SI ATTR void gather_48(const uint8_t* p, I32 ix, U64* v) {
// As in gather_24(), with everything doubled.
p -= 2;
@@ -433,32 +411,28 @@ SI ATTR U32 NS(gather_24_)(const uint8_t* p, I32 ix) {
*v >>= 16;
}
- #define gather_48 NS(gather_48_)
#endif
-SI ATTR F NS(F_from_U8_)(U8 v) {
+SI ATTR F F_from_U8(U8 v) {
return CAST(F, v) * (1/255.0f);
}
-#define F_from_U8 NS(F_from_U8_)
-SI ATTR F NS(F_from_U16_BE_)(U16 v) {
+SI ATTR F F_from_U16_BE(U16 v) {
// All 16-bit ICC values are big-endian, so we byte swap before converting to float.
// MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
v = (U16)( ((v<<8)|(v>>8)) & 0xffff );
return CAST(F, v) * (1/65535.0f);
}
-#define F_from_U16_BE NS(F_from_U16_BE_)
-SI ATTR F NS(minus_1_ulp_)(F v) {
+SI ATTR F minus_1_ulp(F v) {
I32 bits;
small_memcpy(&bits, &v, sizeof(bits));
bits = bits - 1;
small_memcpy(&v, &bits, sizeof(bits));
return v;
}
-#define minus_1_ulp NS(minus_1_ulp_)
-SI ATTR F NS(table_8_)(const skcms_Curve* curve, F v) {
+SI ATTR F table_8(const skcms_Curve* curve, F v) {
// Clamp the input to [0,1], then scale to a table index.
F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
@@ -476,7 +450,7 @@ SI ATTR F NS(table_8_)(const skcms_Curve* curve, F v) {
return l + (h-l)*t;
}
-SI ATTR F NS(table_16_)(const skcms_Curve* curve, F v) {
+SI ATTR F table_16(const skcms_Curve* curve, F v) {
// All just as in table_8() until the gathers.
F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
@@ -492,7 +466,7 @@ SI ATTR F NS(table_16_)(const skcms_Curve* curve, F v) {
}
// Color lookup tables, by input dimension and bit depth.
-SI ATTR void NS(clut_0_8_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+SI ATTR void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
U32 rgb = gather_24(a2b->grid_8, ix);
*r = CAST(F, (rgb >> 0) & 0xff) * (1/255.0f);
@@ -502,7 +476,7 @@ SI ATTR void NS(clut_0_8_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g,
(void)a;
(void)stride;
}
-SI ATTR void NS(clut_0_16_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+SI ATTR void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
#if defined(__arm__)
// This is up to 2x faster on 32-bit ARM than the #else-case fast path.
*r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
@@ -532,28 +506,28 @@ SI ATTR void NS(clut_0_16_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g
// These are all the same basic approach: handle one dimension, then the rest recursively.
// We let "I" be the current dimension, and "J" the previous dimension, I-1. "B" is the bit depth.
-#define DEF_CLUT(I,J,B) \
- MAYBE_SI ATTR \
- void NS(clut_##I##_##B##_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
- I32 limit = CAST(I32, F0); \
- limit += a2b->grid_points[I-1]; \
- \
- const F* srcs[] = { r,g,b,&a }; \
- F src = *srcs[I-1]; \
- \
- F x = max_(F0, min_(src, F1)) * CAST(F, limit - 1); \
- \
- I32 lo = CAST(I32, x ), \
- hi = CAST(I32, minus_1_ulp(x+1.0f)); \
- F lr = *r, lg = *g, lb = *b, \
- hr = *r, hg = *g, hb = *b; \
- NS(clut_##J##_##B##_)(a2b, stride*lo + ix, stride*limit, &lr,&lg,&lb,a); \
- NS(clut_##J##_##B##_)(a2b, stride*hi + ix, stride*limit, &hr,&hg,&hb,a); \
- \
- F t = x - CAST(F, lo); \
- *r = lr + (hr-lr)*t; \
- *g = lg + (hg-lg)*t; \
- *b = lb + (hb-lb)*t; \
+#define DEF_CLUT(I,J,B) \
+ MAYBE_SI ATTR \
+ void clut_##I##_##B(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
+ I32 limit = CAST(I32, F0); \
+ limit += a2b->grid_points[I-1]; \
+ \
+ const F* srcs[] = { r,g,b,&a }; \
+ F src = *srcs[I-1]; \
+ \
+ F x = max_(F0, min_(src, F1)) * CAST(F, limit - 1); \
+ \
+ I32 lo = CAST(I32, x ), \
+ hi = CAST(I32, minus_1_ulp(x+1.0f)); \
+ F lr = *r, lg = *g, lb = *b, \
+ hr = *r, hg = *g, hb = *b; \
+ clut_##J##_##B(a2b, stride*lo + ix, stride*limit, &lr,&lg,&lb,a); \
+ clut_##J##_##B(a2b, stride*hi + ix, stride*limit, &hr,&hg,&hb,a); \
+ \
+ F t = x - CAST(F, lo); \
+ *r = lr + (hr-lr)*t; \
+ *g = lg + (hg-lg)*t; \
+ *b = lb + (hb-lb)*t; \
}
DEF_CLUT(1,0,8)
@@ -567,8 +541,8 @@ DEF_CLUT(3,2,16)
DEF_CLUT(4,3,16)
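DEF_CLUT builds multilinear CLUT interpolation recursively: dimension I clamps its input to [0,1], scales it to grid coordinates, evaluates the (I-1)-dimensional lookup at the two bracketing grid indices, and lerps, bottoming out at clut_0_8/clut_0_16, which read the grid itself. A hypothetical one-dimensional, single-channel version of the same shape (the real code nudges the upper index down by one ulp via minus_1_ulp() rather than clamping):

```cpp
#include <algorithm>
#include <cstdio>

static float clut_0(const float* grid, int ix) { return grid[ix]; }   // base case: read the grid

static float clut_1(const float* grid, int grid_points, float src) {
    float x  = std::min(std::max(src, 0.0f), 1.0f) * (grid_points - 1);
    int   lo = (int)x,
          hi = std::min(lo + 1, grid_points - 1);
    float lo_v = clut_0(grid, lo),            // a 2-D CLUT would recurse into clut_1 here instead
          hi_v = clut_0(grid, hi),
          t    = x - (float)lo;
    return lo_v + (hi_v - lo_v) * t;
}

int main() {
    const float grid[] = {0.0f, 0.25f, 1.0f};   // 3 grid points over [0,1]
    std::printf("%g %g\n",
                clut_1(grid, 3, 0.25f),         // 0.125
                clut_1(grid, 3, 0.75f));        // 0.625
}
```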
ATTR
-static void NS(exec_ops)(const Op* ops, const void** args,
- const char* src, char* dst, int i) {
+static void exec_ops(const Op* ops, const void** args,
+ const char* src, char* dst, int i) {
F r = F0, g = F0, b = F0, a = F0;
while (true) {
switch (*ops++) {
@@ -863,36 +837,36 @@ static void NS(exec_ops)(const Op* ops, const void** args,
case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
- case Op_table_8_r: { r = NS(table_8_ )((const skcms_Curve*)*args++, r); } break;
- case Op_table_8_g: { g = NS(table_8_ )((const skcms_Curve*)*args++, g); } break;
- case Op_table_8_b: { b = NS(table_8_ )((const skcms_Curve*)*args++, b); } break;
- case Op_table_8_a: { a = NS(table_8_ )((const skcms_Curve*)*args++, a); } break;
+ case Op_table_8_r: { r = table_8((const skcms_Curve*)*args++, r); } break;
+ case Op_table_8_g: { g = table_8((const skcms_Curve*)*args++, g); } break;
+ case Op_table_8_b: { b = table_8((const skcms_Curve*)*args++, b); } break;
+ case Op_table_8_a: { a = table_8((const skcms_Curve*)*args++, a); } break;
- case Op_table_16_r:{ r = NS(table_16_)((const skcms_Curve*)*args++, r); } break;
- case Op_table_16_g:{ g = NS(table_16_)((const skcms_Curve*)*args++, g); } break;
- case Op_table_16_b:{ b = NS(table_16_)((const skcms_Curve*)*args++, b); } break;
- case Op_table_16_a:{ a = NS(table_16_)((const skcms_Curve*)*args++, a); } break;
+ case Op_table_16_r:{ r = table_16((const skcms_Curve*)*args++, r); } break;
+ case Op_table_16_g:{ g = table_16((const skcms_Curve*)*args++, g); } break;
+ case Op_table_16_b:{ b = table_16((const skcms_Curve*)*args++, b); } break;
+ case Op_table_16_a:{ a = table_16((const skcms_Curve*)*args++, a); } break;
case Op_clut_3D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- NS(clut_3_8_)(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+ clut_3_8(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
} break;
case Op_clut_3D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- NS(clut_3_16_)(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+ clut_3_16(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
} break;
case Op_clut_4D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- NS(clut_4_8_)(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+ clut_4_8(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
// 'a' was really a CMYK K, so our output is actually opaque.
a = F1;
} break;
case Op_clut_4D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- NS(clut_4_16_)(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+ clut_4_16(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
// 'a' was really a CMYK K, so our output is actually opaque.
a = F1;
} break;
@@ -1098,12 +1072,12 @@ static void NS(exec_ops)(const Op* ops, const void** args,
}
ATTR
-static void NS(run_program)(const Op* program, const void** arguments,
- const char* src, char* dst, int n,
- const size_t src_bpp, const size_t dst_bpp) {
+static void run_program(const Op* program, const void** arguments,
+ const char* src, char* dst, int n,
+ const size_t src_bpp, const size_t dst_bpp) {
int i = 0;
while (n >= N) {
- NS(exec_ops)(program, arguments, src, dst, i);
+ exec_ops(program, arguments, src, dst, i);
i += N;
n -= N;
}
@@ -1112,7 +1086,7 @@ static void NS(run_program)(const Op* program, const void** arguments,
tmp_dst[4*4*N] = {0};
memcpy(tmp_src, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
- NS(exec_ops)(program, arguments, tmp_src, tmp_dst, 0);
+ exec_ops(program, arguments, tmp_src, tmp_dst, 0);
memcpy((char*)dst + (size_t)i*dst_bpp, tmp_dst, (size_t)n*dst_bpp);
}
}
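run_program() keeps its shape: whole N-pixel batches are processed in place, and the ragged tail of n % N pixels is bounced through zeroed scratch buffers sized for the widest supported pixel format (4 channels x 4 bytes x N), so exec_ops always reads and writes a full vector's worth of memory. A miniature runnable version of that loop with a trivial stand-in for exec_ops:

```cpp
#include <cstring>
#include <cstdio>

constexpr int N = 4;                                      // pretend the vectors are 4 pixels wide

static void exec_ops_demo(const char* src, char* dst, int i) {
    // Stand-in for exec_ops: transform one batch of N single-byte pixels.
    for (int k = 0; k < N; k++) dst[i + k] = (char)(src[i + k] + 1);
}

static void run_program_demo(const char* src, char* dst, int n,
                             size_t src_bpp, size_t dst_bpp) {
    int i = 0;
    while (n >= N) {                                      // full batches, in place
        exec_ops_demo(src, dst, i);
        i += N;
        n -= N;
    }
    if (n > 0) {                                          // ragged tail via scratch buffers
        char tmp_src[4*4*N] = {0},
             tmp_dst[4*4*N] = {0};
        std::memcpy(tmp_src, src + (size_t)i*src_bpp, (size_t)n*src_bpp);
        exec_ops_demo(tmp_src, tmp_dst, 0);
        std::memcpy(dst + (size_t)i*dst_bpp, tmp_dst, (size_t)n*dst_bpp);
    }
}

int main() {
    char src[6] = {10, 20, 30, 40, 50, 60}, dst[6] = {0};
    run_program_demo(src, dst, 6, 1, 1);                  // one full batch plus a 2-pixel tail
    for (char c : dst) std::printf("%d ", c);             // 11 21 31 41 51 61
    std::printf("\n");
}
```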
diff --git a/third_party/skcms/version.sha1 b/third_party/skcms/version.sha1
index 746a312baf..5554bc6e81 100755
--- a/third_party/skcms/version.sha1
+++ b/third_party/skcms/version.sha1
@@ -1 +1 @@
-23e7777f421d11992e8245771641141aa273556b
\ No newline at end of file
+51fba282d9a06baa9f8ed2e3d679c5f439420322
\ No newline at end of file