From 4241bab68aa2206a7be63d76411d1f9a39aa4dab Mon Sep 17 00:00:00 2001
From: "skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com" <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>
Date: Mon, 30 Jul 2018 13:00:30 +0000
Subject: Roll skia/third_party/skcms 23e7777f421d..51fba282d9a0 (1 commits)

https://skia.googlesource.com/skcms.git/+log/23e7777f421d..51fba282d9a0

2018-07-30 mtklein@google.com start cleaning up src/Transform_inl.h with C++

The AutoRoll server is located here: https://skcms-skia-roll.skia.org
Documentation for the AutoRoller is here:
https://skia.googlesource.com/buildbot/+/master/autoroll/README.md

If the roll is causing failures, please contact the current sheriff, who
should be CC'd on the roll, and stop the roller if necessary.

CQ_INCLUDE_TRYBOTS=master.tryserver.blink:linux_trusty_blink_rel
TBR=scroggo@google.com

Change-Id: I3005cfe39088dae5564a8ddd9fffd6eefd122065
Reviewed-on: https://skia-review.googlesource.com/144296
Reviewed-by: skcms-skia-autoroll <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>
Commit-Queue: skcms-skia-autoroll <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>
---
 third_party/skcms/skcms.cc            | 238 +++++++++++++---------------------
 third_party/skcms/src/Transform_inl.h | 196 ++++++++++++----------------
 third_party/skcms/version.sha1        |   2 +-
 3 files changed, 174 insertions(+), 262 deletions(-)

diff --git a/third_party/skcms/skcms.cc b/third_party/skcms/skcms.cc
index 0a820629ad..3595088633 100644
--- a/third_party/skcms/skcms.cc
+++ b/third_party/skcms/skcms.cc
@@ -13,6 +13,12 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if defined(__ARM_NEON)
+    #include <arm_neon.h>
+#elif defined(__SSE__)
+    #include <immintrin.h>
+#endif
+
 // sizeof(x) will return size_t, which is 32-bit on some machines and 64-bit on others.
 // We have better testing on 64-bit machines, so force 32-bit machines to behave like 64-bit.
 //
@@ -1806,174 +1812,108 @@ typedef enum {
 #endif
 
 #if defined(__clang__)
-    typedef float    __attribute__((ext_vector_type(4)))  Fx4;
-    typedef int32_t  __attribute__((ext_vector_type(4)))  I32x4;
-    typedef uint64_t __attribute__((ext_vector_type(4)))  U64x4;
-    typedef uint32_t __attribute__((ext_vector_type(4)))  U32x4;
-    typedef uint16_t __attribute__((ext_vector_type(4)))  U16x4;
-    typedef uint8_t  __attribute__((ext_vector_type(4)))  U8x4;
-
-    typedef float    __attribute__((ext_vector_type(8)))  Fx8;
-    typedef int32_t  __attribute__((ext_vector_type(8)))  I32x8;
-    typedef uint64_t __attribute__((ext_vector_type(8)))  U64x8;
-    typedef uint32_t __attribute__((ext_vector_type(8)))  U32x8;
-    typedef uint16_t __attribute__((ext_vector_type(8)))  U16x8;
-    typedef uint8_t  __attribute__((ext_vector_type(8)))  U8x8;
-
-    typedef float    __attribute__((ext_vector_type(16))) Fx16;
-    typedef int32_t  __attribute__((ext_vector_type(16))) I32x16;
-    typedef uint64_t __attribute__((ext_vector_type(16))) U64x16;
-    typedef uint32_t __attribute__((ext_vector_type(16))) U32x16;
-    typedef uint16_t __attribute__((ext_vector_type(16))) U16x16;
-    typedef uint8_t  __attribute__((ext_vector_type(16))) U8x16;
+    template <typename T, int N> using Vec = T __attribute__((ext_vector_type(N)));
 #elif defined(__GNUC__)
-    typedef float    __attribute__((vector_size(16))) Fx4;
-    typedef int32_t  __attribute__((vector_size(16))) I32x4;
-    typedef uint64_t __attribute__((vector_size(32))) U64x4;
-    typedef uint32_t __attribute__((vector_size(16))) U32x4;
-    typedef uint16_t __attribute__((vector_size( 8))) U16x4;
-    typedef uint8_t  __attribute__((vector_size( 4))) U8x4;
-
-    typedef float    __attribute__((vector_size(32))) Fx8;
-    typedef int32_t  __attribute__((vector_size(32))) I32x8;
-    typedef uint64_t __attribute__((vector_size(64))) U64x8;
-    typedef uint32_t __attribute__((vector_size(32))) U32x8;
-    typedef uint16_t __attribute__((vector_size(16))) U16x8;
-    typedef uint8_t  __attribute__((vector_size( 8))) U8x8;
-
-    typedef float    __attribute__((vector_size( 64))) Fx16;
-    typedef int32_t  __attribute__((vector_size( 64))) I32x16;
-    typedef uint64_t __attribute__((vector_size(128))) U64x16;
-    typedef uint32_t __attribute__((vector_size( 64))) U32x16;
-    typedef uint16_t __attribute__((vector_size( 32))) U16x16;
-    typedef uint8_t  __attribute__((vector_size( 16))) U8x16;
+    // For some reason GCC accepts this nonsense, but not the more straightforward version,
+    // template <typename T, int N> using Vec = T __attribute__((vector_size(N*sizeof(T))));
+    template <typename T, int N>
+    struct VecHelper { typedef T __attribute__((vector_size(N*sizeof(T)))) V; };
+
+    template <typename T, int N> using Vec = typename VecHelper<T,N>::V;
 #endif
 
 // First, instantiate our default exec_ops() implementation using the default compilation target.
+namespace baseline {
 #if defined(SKCMS_PORTABLE) || !(defined(__clang__) || defined(__GNUC__))
     #define N 1
-
-    #define F   float
-    #define U64 uint64_t
-    #define U32 uint32_t
-    #define I32 int32_t
-    #define U16 uint16_t
-    #define U8  uint8_t
-
-    #define F0 0.0f
-    #define F1 1.0f
+    using F   = float;
+    using U64 = uint64_t;
+    using U32 = uint32_t;
+    using I32 = int32_t;
+    using U16 = uint16_t;
+    using U8  = uint8_t;
 #elif defined(__AVX512F__)
     #define N 16
-
-    #define F   Fx16
-    #define U64 U64x16
-    #define U32 U32x16
-    #define I32 I32x16
-    #define U16 U16x16
-    #define U8  U8x16
-
-    #define F0 F{0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}
-    #define F1 F{1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1}
+    using F   = Vec<float   ,16>;
+    using I32 = Vec<int32_t ,16>;
+    using U64 = Vec<uint64_t,16>;
+    using U32 = Vec<uint32_t,16>;
+    using U16 = Vec<uint16_t,16>;
+    using U8  = Vec<uint8_t ,16>;
 #elif defined(__AVX__)
     #define N 8
-
-    #define F   Fx8
-    #define U64 U64x8
-    #define U32 U32x8
-    #define I32 I32x8
-    #define U16 U16x8
-    #define U8  U8x8
-
-    #define F0 F{0,0,0,0, 0,0,0,0}
-    #define F1 F{1,1,1,1, 1,1,1,1}
+    using F   = Vec<float   ,8>;
+    using I32 = Vec<int32_t ,8>;
+    using U64 = Vec<uint64_t,8>;
+    using U32 = Vec<uint32_t,8>;
+    using U16 = Vec<uint16_t,8>;
+    using U8  = Vec<uint8_t ,8>;
 #else
     #define N 4
-
-    #define F   Fx4
-    #define U64 U64x4
-    #define U32 U32x4
-    #define I32 I32x4
-    #define U16 U16x4
-    #define U8  U8x4
-
-    #define F0 F{0,0,0,0}
-    #define F1 F{1,1,1,1}
+    using F   = Vec<float   ,4>;
+    using I32 = Vec<int32_t ,4>;
+    using U64 = Vec<uint64_t,4>;
+    using U32 = Vec<uint32_t,4>;
+    using U16 = Vec<uint16_t,4>;
+    using U8  = Vec<uint8_t ,4>;
 #endif
 
-#define NS(id) id
-#define ATTR
+    #define ATTR
 #include "src/Transform_inl.h"
-#undef N
-#undef F
-#undef U64
-#undef U32
-#undef I32
-#undef U16
-#undef U8
-#undef F0
-#undef F1
-#undef NS
-#undef ATTR
+    #undef N
+    #undef ATTR
+}
 
 // Now, instantiate any other versions of run_program() we may want for runtime detection.
 #if !defined(SKCMS_PORTABLE) && (defined(__clang__) || defined(__GNUC__)) \
         && defined(__x86_64__) && !defined(__AVX2__)
-    #define N 8
-    #define F   Fx8
-    #define U64 U64x8
-    #define U32 U32x8
-    #define I32 I32x8
-    #define U16 U16x8
-    #define U8  U8x8
-    #define F0 F{0,0,0,0, 0,0,0,0}
-    #define F1 F{1,1,1,1, 1,1,1,1}
-
-    #define NS(id) id ## _hsw
-    #define ATTR __attribute__((target("avx2,f16c")))
-
-    // We check these guards to see if we have support for these features.
-    // They're likely _not_ defined here in our baseline build config.
-    #ifndef __AVX__
-        #define __AVX__ 1
-        #define UNDEF_AVX
-    #endif
-    #ifndef __F16C__
-        #define __F16C__ 1
-        #define UNDEF_F16C
-    #endif
-    #ifndef __AVX2__
-        #define __AVX2__ 1
-        #define UNDEF_AVX2
-    #endif
-
-    #include "src/Transform_inl.h"
-
-    #undef N
-    #undef F
-    #undef U64
-    #undef U32
-    #undef I32
-    #undef U16
-    #undef U8
-    #undef F0
-    #undef F1
-    #undef NS
-    #undef ATTR
-    #ifdef UNDEF_AVX
-        #undef __AVX__
-        #undef UNDEF_AVX
-    #endif
-    #ifdef UNDEF_F16C
-        #undef __F16C__
-        #undef UNDEF_F16C
-    #endif
-    #ifdef UNDEF_AVX2
-        #undef __AVX2__
-        #undef UNDEF_AVX2
-    #endif
+    namespace hsw {
+        #define N 8
+        using F   = Vec<float   ,8>;
+        using I32 = Vec<int32_t ,8>;
+        using U64 = Vec<uint64_t,8>;
+        using U32 = Vec<uint32_t,8>;
+        using U16 = Vec<uint16_t,8>;
+        using U8  = Vec<uint8_t ,8>;
+
+        #define ATTR __attribute__((target("avx2,f16c")))
+
+        // We check these guards to see if we have support for these features.
+        // They're likely _not_ defined here in our baseline build config.
+        #ifndef __AVX__
+            #define __AVX__ 1
+            #define UNDEF_AVX
+        #endif
+        #ifndef __F16C__
+            #define __F16C__ 1
+            #define UNDEF_F16C
+        #endif
+        #ifndef __AVX2__
+            #define __AVX2__ 1
+            #define UNDEF_AVX2
+        #endif
+
+        #include "src/Transform_inl.h"
+
+        #undef N
+        #undef ATTR
+
+        #ifdef UNDEF_AVX
+            #undef __AVX__
+            #undef UNDEF_AVX
+        #endif
+        #ifdef UNDEF_F16C
+            #undef __F16C__
+            #undef UNDEF_F16C
+        #endif
+        #ifdef UNDEF_AVX2
+            #undef __AVX2__
+            #undef UNDEF_AVX2
+        #endif
+    }
 
     #define TEST_FOR_HSW
 
@@ -2319,11 +2259,9 @@ bool skcms_Transform(const void* src,
         case skcms_PixelFormat_RGBA_ffff >> 1: *ops++ = Op_store_ffff; break;
     }
 
-    void (*run)(const Op*, const void**, const char*, char*, int, size_t,size_t) = run_program;
+    auto run = baseline::run_program;
 #if defined(TEST_FOR_HSW)
-    if (hsw_ok()) {
-        run = run_program_hsw;
-    }
+    if (hsw_ok()) { run = hsw::run_program; }
 #endif
     run(program, arguments, (const char*)src, (char*)dst, n, src_bpp,dst_bpp);
     return true;
diff --git a/third_party/skcms/src/Transform_inl.h b/third_party/skcms/src/Transform_inl.h
index 09183bfd42..4d09fed750 100644
--- a/third_party/skcms/src/Transform_inl.h
+++ b/third_party/skcms/src/Transform_inl.h
@@ -7,30 +7,28 @@
 
 // Intentionally NO #pragma once... included multiple times.
 
-// This file is included from skcms.c with some values and types pre-defined:
+// This file is included from skcms.cc with some pre-defined macros:
 //    N:    depth of all vectors, 1,4,8, or 16
-//
+//    ATTR: an __attribute__ to apply to functions
+// and inside a namespace, with some types already defined:
 //    F:    a vector of N float
 //    I32:  a vector of N int32_t
 //    U64:  a vector of N uint64_t
 //    U32:  a vector of N uint32_t
 //    U16:  a vector of N uint16_t
 //    U8:   a vector of N uint8_t
-//
-//    F0: a vector of N floats set to zero
-//    F1: a vector of N floats set to one
-//
-//    NS(id): a macro that returns unique identifiers
-//    ATTR: an __attribute__ to apply to functions
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     // TODO(mtklein): this build supports FP16 compute
 #endif
 
-#if defined(__ARM_NEON)
-    #include <arm_neon.h>
-#elif defined(__SSE__)
-    #include <immintrin.h>
+#if defined(__GNUC__) && !defined(__clang__)
+    // Once again, GCC is kind of weird, not allowing vector = scalar directly.
+    static constexpr F F0 = F() + 0.0f,
+                       F1 = F() + 1.0f;
+#else
+    static constexpr F F0 = 0.0f,
+                       F1 = 1.0f;
 #endif
 
 #if N == 4 && defined(__ARM_NEON)
@@ -83,8 +81,7 @@
 // When we convert from float to fixed point, it's very common to want to round,
 // and for some reason compilers generate better code when converting to int32_t.
 // To serve both those ends, we use this function to_fixed() instead of direct CASTs.
-SI ATTR I32 NS(to_fixed_)(F f) { return CAST(I32, f + 0.5f); }
-#define to_fixed NS(to_fixed_)
+SI ATTR I32 to_fixed(F f) { return CAST(I32, f + 0.5f); }
 
 // Comparisons result in bool when N == 1, in an I32 mask when N > 1.
 // We've made this a macro so it can be type-generic...
@@ -96,23 +93,23 @@ SI ATTR I32 NS(to_fixed_)(F f) { return CAST(I32, f + 0.5f); }
 #endif
 
 #if defined(USING_NEON_F16C)
-    SI ATTR F   NS(F_from_Half_(U16 half)) { return vcvt_f32_f16((float16x4_t)half); }
-    SI ATTR U16 NS(Half_from_F_(F f))      { return (U16)vcvt_f16_f32(            f); }
+    SI ATTR F   F_from_Half(U16 half) { return vcvt_f32_f16((float16x4_t)half); }
+    SI ATTR U16 Half_from_F(F f)      { return (U16)vcvt_f16_f32(            f); }
 #elif defined(__AVX512F__)
-    SI ATTR F   NS(F_from_Half_)(U16 half) { return (F)_mm512_cvtph_ps((__m256i)half); }
-    SI ATTR U16 NS(Half_from_F_)(F f) {
+    SI ATTR F   F_from_Half(U16 half) { return (F)_mm512_cvtph_ps((__m256i)half); }
+    SI ATTR U16 Half_from_F(F f) {
         return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
     }
 #elif defined(USING_AVX_F16C)
-    SI ATTR F NS(F_from_Half_)(U16 half) {
+    SI ATTR F F_from_Half(U16 half) {
         typedef int16_t __attribute__((vector_size(16))) I16;
         return __builtin_ia32_vcvtph2ps256((I16)half);
     }
-    SI ATTR U16 NS(Half_from_F_)(F f) {
+    SI ATTR U16 Half_from_F(F f) {
         return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
     }
 #else
-    SI ATTR F NS(F_from_Half_)(U16 half) {
+    SI ATTR F F_from_Half(U16 half) {
         U32 wide = CAST(U32, half);
         // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
         U32 s  = wide & 0x8000,
@@ -127,7 +124,7 @@ SI ATTR I32 NS(to_fixed_)(F f) { return CAST(I32, f + 0.5f); }
         return (F)if_then_else(em < 0x0400, F0, norm);
     }
 
-    SI ATTR U16 NS(Half_from_F_)(F f) {
+    SI ATTR U16 Half_from_F(F f) {
         // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
         U32 sem;
         small_memcpy(&sem, &f, sizeof(sem));
@@ -141,36 +138,28 @@ SI ATTR I32 NS(to_fixed_)(F f) { return CAST(I32, f + 0.5f); }
     }
 #endif
 
-#define F_from_Half NS(F_from_Half_)
-#define Half_from_F NS(Half_from_F_)
-
 // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
 #if defined(USING_NEON)
-    SI ATTR U16 NS(swap_endian_16_)(U16 v) {
+    SI ATTR U16 swap_endian_16(U16 v) {
         return (U16)vrev16_u8((uint8x8_t) v);
     }
-    #define swap_endian_16 NS(swap_endian_16_)
 #endif
 
 // Passing by U64* instead of U64 avoids ABI warnings. It's all moot when inlined.
-SI ATTR void NS(swap_endian_16x4_)(U64* rgba) {
+SI ATTR void swap_endian_16x4(U64* rgba) {
     *rgba = (*rgba & 0x00ff00ff00ff00ff) << 8
           | (*rgba & 0xff00ff00ff00ff00) >> 8;
 }
-#define swap_endian_16x4 NS(swap_endian_16x4_)
 
 #if defined(USING_NEON)
-    SI ATTR F NS(min__)(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
-    SI ATTR F NS(max__)(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
+    SI ATTR F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
+    SI ATTR F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
 #else
-    SI ATTR F NS(min__)(F x, F y) { return (F)if_then_else(x > y, y, x); }
-    SI ATTR F NS(max__)(F x, F y) { return (F)if_then_else(x < y, y, x); }
+    SI ATTR F min_(F x, F y) { return (F)if_then_else(x > y, y, x); }
+    SI ATTR F max_(F x, F y) { return (F)if_then_else(x < y, y, x); }
 #endif
-#define min_ NS(min__)
-#define max_ NS(max__)
 
-SI ATTR F NS(floor__)(F x) {
+SI ATTR F floor_(F x) {
 #if N == 1
     return floorf_(x);
 #elif defined(__aarch64__)
@@ -191,9 +180,8 @@ SI ATTR F NS(floor__)(F x) {
     // the range an integer can represent.  We expect most x to be small.
 #endif
 }
-#define floor_ NS(floor__)
 
-SI ATTR F NS(approx_log2_)(F x) {
+SI ATTR F approx_log2(F x) {
     // The first approximation of log2(x) is its exponent 'e', minus 127.
     I32 bits;
     small_memcpy(&bits, &x, sizeof(bits));
@@ -209,9 +197,8 @@ SI ATTR F NS(approx_log2_)(F x) {
          - 1.498030302f*m
          - 1.725879990f/(0.3520887068f + m);
 }
-#define approx_log2 NS(approx_log2_)
 
-SI ATTR F NS(approx_exp2_)(F x) {
+SI ATTR F approx_exp2(F x) {
     F fract = x - floor_(x);
 
     I32 bits = CAST(I32, (1.0f * (1<<23)) * (x + 121.274057500f
@@ -220,16 +207,14 @@ SI ATTR F NS(approx_exp2_)(F x) {
     small_memcpy(&x, &bits, sizeof(x));
     return x;
 }
-#define approx_exp2 NS(approx_exp2_)
 
-SI ATTR F NS(approx_pow_)(F x, float y) {
+SI ATTR F approx_pow(F x, float y) {
     return (F)if_then_else((x == F0) | (x == F1), x
                                                 , approx_exp2(approx_log2(x) * y));
 }
-#define approx_pow NS(approx_pow_)
 
 // Return tf(x).
-SI ATTR F NS(apply_tf_)(const skcms_TransferFunction* tf, F x) {
+SI ATTR F apply_tf(const skcms_TransferFunction* tf, F x) {
     F sign = (F)if_then_else(x < 0, -F1, F1);
     x *= sign;
 
@@ -238,7 +223,6 @@ SI ATTR F NS(apply_tf_)(const skcms_TransferFunction* tf, F x) {
 
     return sign * (F)if_then_else(x < tf->d, linear, nonlinear);
 }
-#define apply_tf NS(apply_tf_)
 
 // Strided loads and stores of N values, starting from p.
 #if N == 1
@@ -283,7 +267,7 @@ SI ATTR F NS(apply_tf_)(const skcms_TransferFunction* tf, F x) {
     (p)[48] = (v)[12]; (p)[52] = (v)[13]; (p)[56] = (v)[14]; (p)[60] = (v)[15]
 #endif
 
-SI ATTR U8 NS(gather_8_)(const uint8_t* p, I32 ix) {
+SI ATTR U8 gather_8(const uint8_t* p, I32 ix) {
 #if N == 1
     U8 v = p[ix];
 #elif N == 4
@@ -299,17 +283,15 @@ SI ATTR U8 NS(gather_8_)(const uint8_t* p, I32 ix) {
 #endif
     return v;
 }
-#define gather_8 NS(gather_8_)
 
 // Helper for gather_16(), loading the ix'th 16-bit value from p.
-SI ATTR uint16_t NS(load_16_)(const uint8_t* p, int ix) {
+SI ATTR uint16_t load_16(const uint8_t* p, int ix) {
     uint16_t v;
     small_memcpy(&v, p + 2*ix, 2);
     return v;
 }
-#define load_16 NS(load_16_)
 
-SI ATTR U16 NS(gather_16_)(const uint8_t* p, I32 ix) {
+SI ATTR U16 gather_16(const uint8_t* p, I32 ix) {
 #if N == 1
     U16 v = load_16(p,ix);
 #elif N == 4
@@ -325,25 +307,22 @@ SI ATTR U16 NS(gather_16_)(const uint8_t* p, I32 ix) {
 #endif
     return v;
 }
-#define gather_16 NS(gather_16_)
 
 #if !defined(__AVX2__)
     // Helpers for gather_24/48(), loading the ix'th 24/48-bit value from p, and 1/2 extra bytes.
-    SI ATTR uint32_t NS(load_24_32_)(const uint8_t* p, int ix) {
+    SI ATTR uint32_t load_24_32(const uint8_t* p, int ix) {
         uint32_t v;
         small_memcpy(&v, p + 3*ix, 4);
        return v;
     }
-    SI ATTR uint64_t NS(load_48_64_)(const uint8_t* p, int ix) {
+    SI ATTR uint64_t load_48_64(const uint8_t* p, int ix) {
        uint64_t v;
        small_memcpy(&v, p + 6*ix, 8);
        return v;
    }
-    #define load_24_32 NS(load_24_32_)
-    #define load_48_64 NS(load_48_64_)
 #endif
 
-SI ATTR U32 NS(gather_24_)(const uint8_t* p, I32 ix) {
+SI ATTR U32 gather_24(const uint8_t* p, I32 ix) {
     // First, back up a byte.  Any place we're gathering from has a safe junk byte to read
     // in front of it, either a previous table value, or some tag metadata.
     p -= 1;
@@ -379,10 +358,9 @@ SI ATTR U32 NS(gather_24_)(const uint8_t* p, I32 ix) {
     // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
     return v >> 8;
 }
-#define gather_24 NS(gather_24_)
 
 #if !defined(__arm__)
-    SI ATTR void NS(gather_48_)(const uint8_t* p, I32 ix, U64* v) {
+    SI ATTR void gather_48(const uint8_t* p, I32 ix, U64* v) {
         // As in gather_24(), with everything doubled.
         p -= 2;
 
@@ -433,32 +411,28 @@ SI ATTR U32 NS(gather_24_)(const uint8_t* p, I32 ix) {
 
         *v >>= 16;
     }
-    #define gather_48 NS(gather_48_)
 #endif
 
-SI ATTR F NS(F_from_U8_)(U8 v) {
+SI ATTR F F_from_U8(U8 v) {
     return CAST(F, v) * (1/255.0f);
 }
-#define F_from_U8 NS(F_from_U8_)
 
-SI ATTR F NS(F_from_U16_BE_)(U16 v) {
+SI ATTR F F_from_U16_BE(U16 v) {
     // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
     // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
     v = (U16)( ((v<<8)|(v>>8)) & 0xffff );
     return CAST(F, v) * (1/65535.0f);
 }
-#define F_from_U16_BE NS(F_from_U16_BE_)
 
-SI ATTR F NS(minus_1_ulp_)(F v) {
+SI ATTR F minus_1_ulp(F v) {
     I32 bits;
     small_memcpy(&bits, &v, sizeof(bits));
     bits = bits - 1;
     small_memcpy(&v, &bits, sizeof(bits));
     return v;
 }
-#define minus_1_ulp NS(minus_1_ulp_)
 
-SI ATTR F NS(table_8_)(const skcms_Curve* curve, F v) {
+SI ATTR F table_8(const skcms_Curve* curve, F v) {
     // Clamp the input to [0,1], then scale to a table index.
     F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
 
@@ -476,7 +450,7 @@ SI ATTR F NS(table_8_)(const skcms_Curve* curve, F v) {
     return l + (h-l)*t;
 }
 
-SI ATTR F NS(table_16_)(const skcms_Curve* curve, F v) {
+SI ATTR F table_16(const skcms_Curve* curve, F v) {
     // All just as in table_8() until the gathers.
     F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
 
@@ -492,7 +466,7 @@ SI ATTR F NS(table_16_)(const skcms_Curve* curve, F v) {
 }
 
 // Color lookup tables, by input dimension and bit depth.
-SI ATTR void NS(clut_0_8_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+SI ATTR void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
     U32 rgb = gather_24(a2b->grid_8, ix);
 
     *r = CAST(F, (rgb >>  0) & 0xff) * (1/255.0f);
@@ -502,7 +476,7 @@ SI ATTR void NS(clut_0_8_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g,
     (void)a;
     (void)stride;
 }
-SI ATTR void NS(clut_0_16_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+SI ATTR void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
 #if defined(__arm__)
     // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
     *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
@@ -532,28 +506,28 @@ SI ATTR void NS(clut_0_16_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g,
 
 // These are all the same basic approach: handle one dimension, then the rest recursively.
 // We let "I" be the current dimension, and "J" the previous dimension, I-1.  "B" is the bit depth.
-#define DEF_CLUT(I,J,B)                                                            \
-    MAYBE_SI ATTR                                                                  \
-    void NS(clut_##I##_##B##_)(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
-        I32 limit = CAST(I32, F0);                                                 \
-        limit += a2b->grid_points[I-1];                                            \
-                                                                                   \
-        const F* srcs[] = { r,g,b,&a };                                            \
-        F src = *srcs[I-1];                                                        \
-                                                                                   \
-        F x = max_(F0, min_(src, F1)) * CAST(F, limit - 1);                        \
-                                                                                   \
-        I32 lo = CAST(I32,             x      ),                                   \
-            hi = CAST(I32, minus_1_ulp(x+1.0f));                                   \
-        F lr = *r, lg = *g, lb = *b,                                               \
-          hr = *r, hg = *g, hb = *b;                                               \
-        NS(clut_##J##_##B##_)(a2b, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);   \
-        NS(clut_##J##_##B##_)(a2b, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);   \
-                                                                                   \
-        F t = x - CAST(F, lo);                                                     \
-        *r = lr + (hr-lr)*t;                                                       \
-        *g = lg + (hg-lg)*t;                                                       \
-        *b = lb + (hb-lb)*t;                                                       \
+#define DEF_CLUT(I,J,B)                                                            \
+    MAYBE_SI ATTR                                                                  \
+    void clut_##I##_##B(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
+        I32 limit = CAST(I32, F0);                                                 \
+        limit += a2b->grid_points[I-1];                                            \
+                                                                                   \
+        const F* srcs[] = { r,g,b,&a };                                            \
+        F src = *srcs[I-1];                                                        \
+                                                                                   \
+        F x = max_(F0, min_(src, F1)) * CAST(F, limit - 1);                        \
+                                                                                   \
+        I32 lo = CAST(I32,             x      ),                                   \
+            hi = CAST(I32, minus_1_ulp(x+1.0f));                                   \
+        F lr = *r, lg = *g, lb = *b,                                               \
+          hr = *r, hg = *g, hb = *b;                                               \
+        clut_##J##_##B(a2b, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);          \
+        clut_##J##_##B(a2b, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);          \
+                                                                                   \
+        F t = x - CAST(F, lo);                                                     \
+        *r = lr + (hr-lr)*t;                                                       \
+        *g = lg + (hg-lg)*t;                                                       \
+        *b = lb + (hb-lb)*t;                                                       \
     }
 
 DEF_CLUT(1,0,8)
@@ -567,8 +541,8 @@ DEF_CLUT(3,2,16)
 DEF_CLUT(4,3,16)
 
 ATTR
-static void NS(exec_ops)(const Op* ops, const void** args,
-                         const char* src, char* dst, int i) {
+static void exec_ops(const Op* ops, const void** args,
+                     const char* src, char* dst, int i) {
     F r = F0, g = F0, b = F0, a = F0;
     while (true) {
         switch (*ops++) {
@@ -863,36 +837,36 @@ static void NS(exec_ops)(const Op* ops, const void** args,
             case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
             case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
 
-            case Op_table_8_r: { r = NS(table_8_ )((const skcms_Curve*)*args++, r); } break;
-            case Op_table_8_g: { g = NS(table_8_ )((const skcms_Curve*)*args++, g); } break;
-            case Op_table_8_b: { b = NS(table_8_ )((const skcms_Curve*)*args++, b); } break;
-            case Op_table_8_a: { a = NS(table_8_ )((const skcms_Curve*)*args++, a); } break;
+            case Op_table_8_r: { r = table_8((const skcms_Curve*)*args++, r); } break;
+            case Op_table_8_g: { g = table_8((const skcms_Curve*)*args++, g); } break;
+            case Op_table_8_b: { b = table_8((const skcms_Curve*)*args++, b); } break;
+            case Op_table_8_a: { a = table_8((const skcms_Curve*)*args++, a); } break;
 
-            case Op_table_16_r:{ r = NS(table_16_)((const skcms_Curve*)*args++, r); } break;
-            case Op_table_16_g:{ g = NS(table_16_)((const skcms_Curve*)*args++, g); } break;
-            case Op_table_16_b:{ b = NS(table_16_)((const skcms_Curve*)*args++, b); } break;
-            case Op_table_16_a:{ a = NS(table_16_)((const skcms_Curve*)*args++, a); } break;
+            case Op_table_16_r:{ r = table_16((const skcms_Curve*)*args++, r); } break;
+            case Op_table_16_g:{ g = table_16((const skcms_Curve*)*args++, g); } break;
+            case Op_table_16_b:{ b = table_16((const skcms_Curve*)*args++, b); } break;
+            case Op_table_16_a:{ a = table_16((const skcms_Curve*)*args++, a); } break;
 
             case Op_clut_3D_8:{
                 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                NS(clut_3_8_)(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+                clut_3_8(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
             } break;
 
             case Op_clut_3D_16:{
                 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                NS(clut_3_16_)(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+                clut_3_16(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
             } break;
 
             case Op_clut_4D_8:{
                 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                NS(clut_4_8_)(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+                clut_4_8(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
                 // 'a' was really a CMYK K, so our output is actually opaque.
                 a = F1;
             } break;
 
             case Op_clut_4D_16:{
                 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                NS(clut_4_16_)(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+                clut_4_16(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
                 // 'a' was really a CMYK K, so our output is actually opaque.
                 a = F1;
             } break;
@@ -1098,12 +1072,12 @@ static void NS(exec_ops)(const Op* ops, const void** args,
 }
 
 ATTR
-static void NS(run_program)(const Op* program, const void** arguments,
-                            const char* src, char* dst, int n,
-                            const size_t src_bpp, const size_t dst_bpp) {
+static void run_program(const Op* program, const void** arguments,
+                        const char* src, char* dst, int n,
+                        const size_t src_bpp, const size_t dst_bpp) {
     int i = 0;
     while (n >= N) {
-        NS(exec_ops)(program, arguments, src, dst, i);
+        exec_ops(program, arguments, src, dst, i);
         i += N;
         n -= N;
     }
@@ -1112,7 +1086,7 @@ static void NS(run_program)(const Op* program, const void** arguments,
            tmp_dst[4*4*N] = {0};
 
         memcpy(tmp_src, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
-        NS(exec_ops)(program, arguments, tmp_src, tmp_dst, 0);
+        exec_ops(program, arguments, tmp_src, tmp_dst, 0);
         memcpy((char*)dst + (size_t)i*dst_bpp, tmp_dst, (size_t)n*dst_bpp);
     }
 }
diff --git a/third_party/skcms/version.sha1 b/third_party/skcms/version.sha1
index 746a312baf..5554bc6e81 100755
--- a/third_party/skcms/version.sha1
+++ b/third_party/skcms/version.sha1
@@ -1 +1 @@
-23e7777f421d11992e8245771641141aa273556b
\ No newline at end of file
+51fba282d9a06baa9f8ed2e3d679c5f439420322
\ No newline at end of file
-- 
cgit v1.2.3
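
The most visible cleanup in this roll is the Vec alias template that replaces eighteen per-width typedefs. Below is a standalone sketch of the same clang/GCC split; the main() and the values it prints are illustrative additions, not part of the patch.

    // Builds with clang++ or g++ (-std=c++14 or later).
    #include <cstdint>
    #include <cstdio>

    #if defined(__clang__)
        template <typename T, int N> using Vec = T __attribute__((ext_vector_type(N)));
    #elif defined(__GNUC__)
        // GCC rejects the attribute directly on an alias template, so route it
        // through a class template, where the dependent size is accepted.
        template <typename T, int N>
        struct VecHelper { typedef T __attribute__((vector_size(N * sizeof(T)))) V; };
        template <typename T, int N> using Vec = typename VecHelper<T, N>::V;
    #endif

    int main() {
        using F4 = Vec<float, 4>;
        F4 a = {1, 2, 3, 4},
           b = {10, 20, 30, 40};
        F4 c = a + b;                     // element-wise; one SIMD add on SSE/NEON
        for (int i = 0; i < 4; i++) {
            printf("%g ", (double)c[i]);  // lanes index like an array
        }
        printf("\n");                     // prints: 11 22 33 44
        return 0;
    }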
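The namespace change is what lets the NS(id) token-pasting macro go away: baseline and hsw each get a private copy of everything in Transform_inl.h under ordinary names, and skcms_Transform() picks one through a function pointer. A minimal sketch of that dispatch shape, assuming x86-64 with GCC or Clang; have_avx2() is a stand-in for skcms's hsw_ok(), which is outside this diff.

    #include <cstdio>

    namespace baseline {
        static void run_program() { printf("baseline path\n"); }
    }

    namespace hsw {
        // The patch applies this via the ATTR macro to every function in the
        // hsw instantiation of Transform_inl.h.
        __attribute__((target("avx2,f16c")))
        static void run_program() { printf("AVX2+F16C path\n"); }
    }

    static bool have_avx2() {
        return __builtin_cpu_supports("avx2");  // runtime CPUID check, GCC/Clang builtin
    }

    int main() {
        auto run = baseline::run_program;   // plain function pointer, as in the patch
        if (have_avx2()) { run = hsw::run_program; }
        run();
        return 0;
    }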
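The portable F_from_Half() the patch renames is pure bit manipulation: re-bias the 5-bit exponent by 127-15 and slide the 1-5-10 fields into 1-8-23 positions. Here is the same arithmetic for a single lane, a sketch assuming IEEE 754 floats; like the vector code, it flushes subnormal halves (and -0.0) to zero.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static float f32_from_f16_bits(uint16_t h) {
        uint32_t wide = h;
        // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
        uint32_t s  = wide & 0x8000,
                 em = wide ^ s;

        // Move the sign to bit 31, exponent+mantissa up 13 bits, and add the
        // rebias (127-15) into the exponent field.
        uint32_t bits = (s << 16) + (em << 13) + ((127 - 15) << 23);
        float norm;
        memcpy(&norm, &bits, sizeof(norm));

        return em < 0x0400 ? 0.0f : norm;  // exponent field 0 => subnormal => flush
    }

    int main() {
        printf("%g\n", f32_from_f16_bits(0x3C00));  // 1
        printf("%g\n", f32_from_f16_bits(0xC000));  // -2
        printf("%g\n", f32_from_f16_bits(0x3555));  // ~0.333252
        return 0;
    }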
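approx_log2() and approx_exp2(), also renamed here, both lean on the fact that a float's bit pattern read as an integer and scaled by 2^-23 is already a rough log2. The scalar sketch below copies the polynomial constants from the diff; the composition at the bottom is how approx_pow() works. Expect transfer-function accuracy, not libm accuracy.

    #include <cstdint>
    #include <cstring>
    #include <cstdio>
    #include <cmath>

    static float approx_log2(float x) {
        int32_t bits;
        memcpy(&bits, &x, sizeof(bits));
        // bits * 2^-23 = biased exponent + mantissa fraction; the fitted
        // constants below absorb the +127 bias.
        float e = (float)bits * (1.0f / (1 << 23));

        int32_t mbits = (bits & 0x007fffff) | 0x3f000000;  // mantissa mapped into [0.5,1)
        float m;
        memcpy(&m, &mbits, sizeof(m));

        return e - 124.225514990f
                 -   1.498030302f * m
                 -   1.725879990f / (0.3520887068f + m);
    }

    static float approx_exp2(float x) {
        float fract = x - floorf(x);
        int32_t bits = (int32_t)((1.0f * (1 << 23)) * (x + 121.274057500f
                                                         -   1.490129070f * fract
                                                         +  27.728023300f / (4.84252568f - fract)));
        float r;
        memcpy(&r, &bits, sizeof(r));
        return r;
    }

    int main() {
        // approx_pow(x,y) = approx_exp2(approx_log2(x) * y), pinned exactly at x==0 and x==1.
        float x = 0.5f, y = 2.4f;
        printf("approx: %f  libm: %f\n", approx_exp2(approx_log2(x) * y), powf(x, y));
        return 0;
    }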
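gather_24() survives the rename with its load trick intact: instead of assembling each 24-bit table entry from three byte loads, read an overlapping 4-byte word starting one byte early and shift the junk byte off. That is safe because every table skcms gathers from has at least one readable byte in front of it (tag metadata or a previous value). Scalar form, assuming a little-endian host as the patch's fast path does:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint32_t gather_24_scalar(const uint8_t* p, int ix) {
        p -= 1;                    // back up into the known-safe guard byte
        uint32_t v;
        memcpy(&v, p + 3*ix, 4);   // one unaligned 4-byte load instead of three 1-byte loads
        return v >> 8;             // little-endian: the junk byte landed in the low 8 bits
    }

    int main() {
        // One guard byte, then two 3-byte entries.
        const uint8_t table[] = { 0xEE,  0x11,0x22,0x33,  0x44,0x55,0x66 };
        printf("%06x\n", gather_24_scalar(table + 1, 0));  // 332211
        printf("%06x\n", gather_24_scalar(table + 1, 1));  // 665544
        return 0;
    }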
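Finally, the DEF_CLUT(I,J,B) macro builds N-linear color-lookup-table interpolation by recursing over dimensions: split the current input channel into lo/hi grid indices, evaluate the remaining dimensions at both, and lerp. The sketch below mirrors that structure as a C++17 template over an illustrative single-channel float grid; skcms's real grids are packed 8- or 16-bit, carry r,g,b together, and compute hi with the minus_1_ulp trick rather than this clamp.

    #include <algorithm>
    #include <cstdio>

    template <int I>
    static float sample(const float* grid, const int* points, const float* in,
                        int ix, int stride) {
        if constexpr (I == 0) {
            return grid[ix];                  // 0-D base case: read the gathered cell
        } else {
            int   limit = points[I-1];
            float x  = std::clamp(in[I-1], 0.0f, 1.0f) * (limit - 1);
            int   lo = (int)x,
                  hi = std::min(lo + 1, limit - 1);
            float t  = x - lo;
            float l  = sample<I-1>(grid, points, in, stride*lo + ix, stride*limit);
            float h  = sample<I-1>(grid, points, in, stride*hi + ix, stride*limit);
            return l + (h - l)*t;             // lerp this dimension
        }
    }

    int main() {
        // 2x2 single-channel grid laid out as grid[2*i0 + i1] = 10*i0 + i1.
        const float grid[]   = { 0, 1, 10, 11 };
        const int   points[] = { 2, 2 };
        const float in[]     = { 0.5f, 0.5f };
        printf("%g\n", sample<2>(grid, points, in, /*ix=*/0, /*stride=*/1));  // 5.5
        return 0;
    }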