From 53becd1fc251df24c41fbfbdd3d692fa0f312df3 Mon Sep 17 00:00:00 2001
From: "skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com" <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>
Date: Mon, 30 Jul 2018 15:01:30 +0000
Subject: Roll skia/third_party/skcms 779b8c7799f3..c663954a7567 (1 commits)

https://skia.googlesource.com/skcms.git/+log/779b8c7799f3..c663954a7567

2018-07-30 mtklein@google.com more C++ cleanup

The AutoRoll server is located here: https://skcms-skia-roll.skia.org

Documentation for the AutoRoller is here:
https://skia.googlesource.com/buildbot/+/master/autoroll/README.md

If the roll is causing failures, please contact the current sheriff, who should
be CC'd on the roll, and stop the roller if necessary.

CQ_INCLUDE_TRYBOTS=master.tryserver.blink:linux_trusty_blink_rel
TBR=scroggo@google.com

Change-Id: Iaed540ec2429ff1a25537ac222354e6a09a4c200
Reviewed-on: https://skia-review.googlesource.com/144334
Commit-Queue: skcms-skia-autoroll <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>
Reviewed-by: skcms-skia-autoroll <skcms-skia-autoroll@skia-buildbots.google.com.iam.gserviceaccount.com>
---
 third_party/skcms/src/Transform_inl.h | 266 +++++++++++++++++-----------------
 third_party/skcms/version.sha1        |   2 +-
 2 files changed, 136 insertions(+), 132 deletions(-)

diff --git a/third_party/skcms/src/Transform_inl.h b/third_party/skcms/src/Transform_inl.h
index 4d09fed750..8cb39e1f69 100644
--- a/third_party/skcms/src/Transform_inl.h
+++ b/third_party/skcms/src/Transform_inl.h
@@ -45,10 +45,16 @@
 // These -Wvector-conversion warnings seem to trigger in very bogus situations,
 // like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
 #if defined(USING_NEON) && defined(__clang__)
-    #pragma clang diagnostic push
     #pragma clang diagnostic ignored "-Wvector-conversion"
 #endif
 
+// GCC warns us about returning U64 on x86 because it's larger than a register.
+// You'd see warnings like, "using AVX even though AVX is not enabled".
+// We stifle these warnings... our helpers that return U64 are always inlined.
+#if defined(__SSE__) && defined(__GNUC__) && !defined(__clang__)
+    #pragma GCC diagnostic ignored "-Wpsabi"
+#endif
+
 // We tag most helper functions as SI, to enforce good code generation
 // but also work around what we think is a bug in GCC: when targeting 32-bit
 // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
@@ -64,24 +70,29 @@
     #define small_memcpy memcpy
 #endif
 
-// (T)v is a cast when N == 1 and a bit-pun when N>1, so we must use CAST(T, v) to actually cast.
+// (T)v is a cast when N == 1 and a bit-pun when N>1, so we must use cast<T>(v) to actually cast.
 #if N == 1
-    #define CAST(T, v) (T)(v)
+    template <typename D, typename S>
+    SI ATTR D cast(const S& v) { return (D)v; }
 #elif defined(__clang__)
-    #define CAST(T, v) __builtin_convertvector((v), T)
+    template <typename D, typename S>
+    SI ATTR D cast(const S& v) { return __builtin_convertvector(v, D); }
 #elif N == 4
-    #define CAST(T, v) T{(v)[0],(v)[1],(v)[2],(v)[3]}
+    template <typename D, typename S>
+    SI ATTR D cast(const S& v) { return D{v[0],v[1],v[2],v[3]}; }
 #elif N == 8
-    #define CAST(T, v) T{(v)[0],(v)[1],(v)[2],(v)[3], (v)[4],(v)[5],(v)[6],(v)[7]}
+    template <typename D, typename S>
+    SI ATTR D cast(const S& v) { return D{v[0],v[1],v[2],v[3], v[4],v[5],v[6],v[7]}; }
 #elif N == 16
-    #define CAST(T, v) T{(v)[0],(v)[1],(v)[ 2],(v)[ 3], (v)[ 4],(v)[ 5],(v)[ 6],(v)[ 7], \
-                         (v)[8],(v)[9],(v)[10],(v)[11], (v)[12],(v)[13],(v)[14],(v)[15]}
+    template <typename D, typename S>
+    SI ATTR D cast(const S& v) { return D{v[0],v[1],v[ 2],v[ 3], v[ 4],v[ 5],v[ 6],v[ 7],
+                                          v[8],v[9],v[10],v[11], v[12],v[13],v[14],v[15]}; }
 #endif
 
 // When we convert from float to fixed point, it's very common to want to round,
 // and for some reason compilers generate better code when converting to int32_t.
-// To serve both those ends, we use this function to_fixed() instead of direct CASTs.
-SI ATTR I32 to_fixed(F f) { return CAST(I32, f + 0.5f); }
+// To serve both those ends, we use this function to_fixed() instead of direct cast().
+SI ATTR I32 to_fixed(F f) { return cast<I32>(f + 0.5f); }
 
 // Comparisons result in bool when N == 1, in an I32 mask when N > 1.
 // We've made this a macro so it can be type-generic...
@@ -110,7 +121,7 @@ SI ATTR I32 to_fixed(F f) { return CAST(I32, f + 0.5f); }
     }
 #else
     SI ATTR F F_from_Half(U16 half) {
-        U32 wide = CAST(U32, half);
+        U32 wide = cast<U32>(half);
         // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
         U32 s  = wide & 0x8000,
            em = wide ^ s;
@@ -133,7 +144,7 @@ SI ATTR I32 to_fixed(F f) { return CAST(I32, f + 0.5f); }
            em = sem ^ s;
 
         // For simplicity we flush denorm half floats (including all denorm floats) to zero.
-        return CAST(U16, (U32)if_then_else(em < 0x38800000, (U32)F0
+        return cast<U16>((U32)if_then_else(em < 0x38800000, (U32)F0
                                                           , (s>>16) + (em>>13) - ((127-15)<<10)));
     }
 #endif
@@ -145,10 +156,9 @@ SI ATTR I32 to_fixed(F f) { return CAST(I32, f + 0.5f); }
     }
 #endif
 
-// Passing by U64* instead of U64 avoids ABI warnings. It's all moot when inlined.
-SI ATTR void swap_endian_16x4(U64* rgba) {
-    *rgba = (*rgba & 0x00ff00ff00ff00ff) << 8
-          | (*rgba & 0xff00ff00ff00ff00) >> 8;
+SI ATTR U64 swap_endian_16x4(const U64& rgba) {
+    return (rgba & 0x00ff00ff00ff00ff) << 8
+         | (rgba & 0xff00ff00ff00ff00) >> 8;
 }
 
 #if defined(USING_NEON)
@@ -172,7 +182,7 @@ SI ATTR F floor_(F x) {
     return _mm_floor_ps(x);
 #else
     // Round trip through integers with a truncating cast.
-    F roundtrip = CAST(F, CAST(I32, x));
+    F roundtrip = cast<F>(cast<I32>(x));
 
     // If x is negative, truncating gives the ceiling instead of the floor.
     return roundtrip - (F)if_then_else(roundtrip > x, F1, F0);
@@ -186,7 +196,7 @@ SI ATTR F approx_log2(F x) {
     I32 bits;
     small_memcpy(&bits, &x, sizeof(bits));
 
-    F e = CAST(F, bits) * (1.0f / (1<<23));
+    F e = cast<F>(bits) * (1.0f / (1<<23));
 
     // If we use the mantissa too we can refine the error significantly.
     I32 m_bits = (bits & 0x007fffff) | 0x3f000000;
@@ -201,7 +211,7 @@ SI ATTR F approx_log2(F x) {
 SI ATTR F approx_exp2(F x) {
     F fract = x - floor_(x);
 
-    I32 bits = CAST(I32, (1.0f * (1<<23)) * (x + 121.274057500f
+    I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f
                                                - 1.490129070f*fract
                                                + 27.728023300f/(4.84252568f - fract)));
     small_memcpy(&x, &bits, sizeof(x));
@@ -414,14 +424,14 @@ SI ATTR U32 gather_24(const uint8_t* p, I32 ix) {
 #endif
 
 SI ATTR F F_from_U8(U8 v) {
-    return CAST(F, v) * (1/255.0f);
+    return cast<F>(v) * (1/255.0f);
 }
 
 SI ATTR F F_from_U16_BE(U16 v) {
     // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
     // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
     v = (U16)( ((v<<8)|(v>>8)) & 0xffff );
-    return CAST(F, v) * (1/65535.0f);
+    return cast<F>(v) * (1/65535.0f);
 }
 
 SI ATTR F minus_1_ulp(F v) {
@@ -437,9 +447,9 @@ SI ATTR F table_8(const skcms_Curve* curve, F v) {
     F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
 
     // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
-    I32 lo = CAST(I32,             ix      ),
-        hi = CAST(I32, minus_1_ulp(ix+1.0f));
-    F t = ix - CAST(F, lo);  // i.e. the fractional part of ix.
+    I32 lo = cast<I32>(            ix      ),
+        hi = cast<I32>(minus_1_ulp(ix+1.0f));
+    F t = ix - cast<F>(lo);  // i.e. the fractional part of ix.
 
     // TODO: can we load l and h simultaneously? Each entry in 'h' is either
     // the same as in 'l' or adjacent. We have a rough idea that it'd always be safe
@@ -454,9 +464,9 @@ SI ATTR F table_16(const skcms_Curve* curve, F v) {
     // All just as in table_8() until the gathers.
     F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
 
-    I32 lo = CAST(I32,             ix      ),
-        hi = CAST(I32, minus_1_ulp(ix+1.0f));
-    F t = ix - CAST(F, lo);
+    I32 lo = cast<I32>(            ix      ),
+        hi = cast<I32>(minus_1_ulp(ix+1.0f));
+    F t = ix - cast<F>(lo);
 
     // TODO: as above, load l and h simultaneously?
     // Here we could even use AVX2-style 32-bit gathers.
@@ -469,9 +479,9 @@ SI ATTR F table_16(const skcms_Curve* curve, F v) {
 SI ATTR void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
     U32 rgb = gather_24(a2b->grid_8, ix);
 
-    *r = CAST(F, (rgb >>  0) & 0xff) * (1/255.0f);
-    *g = CAST(F, (rgb >>  8) & 0xff) * (1/255.0f);
-    *b = CAST(F, (rgb >> 16) & 0xff) * (1/255.0f);
+    *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
+    *g = cast<F>((rgb >>  8) & 0xff) * (1/255.0f);
+    *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
 
     (void)a;
     (void)stride;
@@ -486,11 +496,11 @@ SI ATTR void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F*
     // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
     U64 rgb;
     gather_48(a2b->grid_16, ix, &rgb);
-    swap_endian_16x4(&rgb);
+    rgb = swap_endian_16x4(rgb);
 
-    *r = CAST(F, (rgb >>  0) & 0xffff) * (1/65535.0f);
-    *g = CAST(F, (rgb >> 16) & 0xffff) * (1/65535.0f);
-    *b = CAST(F, (rgb >> 32) & 0xffff) * (1/65535.0f);
+    *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
+    *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
+    *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
 #endif
     (void)a;
     (void)stride;
@@ -509,22 +519,22 @@ SI ATTR void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F*
 #define DEF_CLUT(I,J,B)                                                                    \
     MAYBE_SI ATTR                                                                          \
     void clut_##I##_##B(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
-        I32 limit = CAST(I32, F0);                                                         \
+        I32 limit = cast<I32>(F0);                                                         \
         limit += a2b->grid_points[I-1];                                                    \
                                                                                            \
         const F* srcs[] = { r,g,b,&a };                                                    \
         F src = *srcs[I-1];                                                                \
                                                                                            \
-        F x = max_(F0, min_(src, F1)) * CAST(F, limit - 1);                                \
+        F x = max_(F0, min_(src, F1)) * cast<F>(limit - 1);                                \
                                                                                            \
-        I32 lo = CAST(I32,             x      ),                                           \
-            hi = CAST(I32, minus_1_ulp(x+1.0f));                                           \
+        I32 lo = cast<I32>(            x      ),                                           \
+            hi = cast<I32>(minus_1_ulp(x+1.0f));                                           \
         F lr = *r, lg = *g, lb = *b,                                                       \
           hr = *r, hg = *g, hb = *b;                                                       \
         clut_##J##_##B(a2b, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);                  \
         clut_##J##_##B(a2b, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);                  \
                                                                                            \
-        F t = x - CAST(F, lo);                                                             \
+        F t = x - cast<F>(lo);                                                             \
         *r = lr + (hr-lr)*t;                                                               \
         *g = lg + (hg-lg)*t;                                                               \
         *b = lb + (hb-lb)*t;                                                               \
@@ -564,19 +574,19 @@ static void exec_ops(const Op* ops, const void** args,
             U16 abgr;
             small_memcpy(&abgr, src + 2*i, 2*N);
 
-            r = CAST(F, (abgr >> 12) & 0xf) * (1/15.0f);
-            g = CAST(F, (abgr >>  8) & 0xf) * (1/15.0f);
-            b = CAST(F, (abgr >>  4) & 0xf) * (1/15.0f);
-            a = CAST(F, (abgr >>  0) & 0xf) * (1/15.0f);
+            r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
+            g = cast<F>((abgr >>  8) & 0xf) * (1/15.0f);
+            b = cast<F>((abgr >>  4) & 0xf) * (1/15.0f);
+            a = cast<F>((abgr >>  0) & 0xf) * (1/15.0f);
         } break;
 
         case Op_load_565:{
             U16 rgb;
             small_memcpy(&rgb, src + 2*i, 2*N);
 
-            r = CAST(F, rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
-            g = CAST(F, rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
-            b = CAST(F, rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
+            r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
+            g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
+            b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
             a = F1;
         } break;
 
@@ -595,13 +605,13 @@ static void exec_ops(const Op* ops, const void** args,
             // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
             // convert to F. (Again, U32 would be even better here if we drop ARMv7 or split
             // ARMv7 and ARMv8 impls.)
-            r = CAST(F, (U16)v.val[0]) * (1/255.0f);
-            g = CAST(F, (U16)v.val[1]) * (1/255.0f);
-            b = CAST(F, (U16)v.val[2]) * (1/255.0f);
+            r = cast<F>((U16)v.val[0]) * (1/255.0f);
+            g = cast<F>((U16)v.val[1]) * (1/255.0f);
+            b = cast<F>((U16)v.val[2]) * (1/255.0f);
 #else
-            r = CAST(F, LOAD_3(U32, rgb+0) ) * (1/255.0f);
-            g = CAST(F, LOAD_3(U32, rgb+1) ) * (1/255.0f);
-            b = CAST(F, LOAD_3(U32, rgb+2) ) * (1/255.0f);
+            r = cast<F>(LOAD_3(U32, rgb+0) ) * (1/255.0f);
+            g = cast<F>(LOAD_3(U32, rgb+1) ) * (1/255.0f);
+            b = cast<F>(LOAD_3(U32, rgb+2) ) * (1/255.0f);
 #endif
             a = F1;
         } break;
@@ -610,20 +620,20 @@ static void exec_ops(const Op* ops, const void** args,
             U32 rgba;
             small_memcpy(&rgba, src + 4*i, 4*N);
 
-            r = CAST(F, (rgba >>  0) & 0xff) * (1/255.0f);
-            g = CAST(F, (rgba >>  8) & 0xff) * (1/255.0f);
-            b = CAST(F, (rgba >> 16) & 0xff) * (1/255.0f);
-            a = CAST(F, (rgba >> 24) & 0xff) * (1/255.0f);
+            r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
+            g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
+            b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
+            a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
         } break;
 
         case Op_load_1010102:{
             U32 rgba;
             small_memcpy(&rgba, src + 4*i, 4*N);
 
-            r = CAST(F, (rgba >>  0) & 0x3ff) * (1/1023.0f);
-            g = CAST(F, (rgba >> 10) & 0x3ff) * (1/1023.0f);
-            b = CAST(F, (rgba >> 20) & 0x3ff) * (1/1023.0f);
-            a = CAST(F, (rgba >> 30) & 0x3  ) * (1/   3.0f);
+            r = cast<F>((rgba >>  0) & 0x3ff) * (1/1023.0f);
+            g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
+            b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
+            a = cast<F>((rgba >> 30) & 0x3  ) * (1/   3.0f);
         } break;
 
@@ -632,17 +642,17 @@ static void exec_ops(const Op* ops, const void** args,
             const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
 #if defined(USING_NEON)
             uint16x4x3_t v = vld3_u16(rgb);
-            r = CAST(F, swap_endian_16((U16)v.val[0])) * (1/65535.0f);
-            g = CAST(F, swap_endian_16((U16)v.val[1])) * (1/65535.0f);
-            b = CAST(F, swap_endian_16((U16)v.val[2])) * (1/65535.0f);
+            r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
+            g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
+            b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
 #else
             U32 R = LOAD_3(U32, rgb+0),
                 G = LOAD_3(U32, rgb+1),
                 B = LOAD_3(U32, rgb+2);
             // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
-            r = CAST(F, (R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
-            g = CAST(F, (G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
-            b = CAST(F, (B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
+            r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
+            g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
+            b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
 #endif
             a = F1;
         } break;
@@ -653,19 +663,19 @@ static void exec_ops(const Op* ops, const void** args,
             const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
 #if defined(USING_NEON)
             uint16x4x4_t v = vld4_u16(rgba);
-            r = CAST(F, swap_endian_16((U16)v.val[0])) * (1/65535.0f);
-            g = CAST(F, swap_endian_16((U16)v.val[1])) * (1/65535.0f);
-            b = CAST(F, swap_endian_16((U16)v.val[2])) * (1/65535.0f);
-            a = CAST(F, swap_endian_16((U16)v.val[3])) * (1/65535.0f);
+            r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
+            g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
+            b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
+            a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
 #else
             U64 px;
             small_memcpy(&px, rgba, 8*N);
 
-            swap_endian_16x4(&px);
-            r = CAST(F, (px >>  0) & 0xffff) * (1/65535.0f);
-            g = CAST(F, (px >> 16) & 0xffff) * (1/65535.0f);
-            b = CAST(F, (px >> 32) & 0xffff) * (1/65535.0f);
-            a = CAST(F, (px >> 48) & 0xffff) * (1/65535.0f);
+            px = swap_endian_16x4(px);
+            r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
+            g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
+            b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
+            a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
 #endif
         } break;
@@ -702,10 +712,10 @@ static void exec_ops(const Op* ops, const void** args,
 #else
             U64 px;
             small_memcpy(&px, rgba, 8*N);
-            U16 R = CAST(U16, (px >>  0) & 0xffff),
-                G = CAST(U16, (px >> 16) & 0xffff),
-                B = CAST(U16, (px >> 32) & 0xffff),
-                A = CAST(U16, (px >> 48) & 0xffff);
+            U16 R = cast<U16>((px >>  0) & 0xffff),
+                G = cast<U16>((px >> 16) & 0xffff),
+                B = cast<U16>((px >> 32) & 0xffff),
+                A = cast<U16>((px >> 48) & 0xffff);
 #endif
             r = F_from_Half(R);
             g = F_from_Half(G);
@@ -849,24 +859,24 @@ static void exec_ops(const Op* ops, const void** args,
 
         case Op_clut_3D_8:{
             const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-            clut_3_8(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+            clut_3_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
         } break;
 
         case Op_clut_3D_16:{
             const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-            clut_3_16(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+            clut_3_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
         } break;
 
         case Op_clut_4D_8:{
             const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-            clut_4_8(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+            clut_4_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
             // 'a' was really a CMYK K, so our output is actually opaque.
             a = F1;
         } break;
 
         case Op_clut_4D_16:{
             const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-            clut_4_16(a2b, CAST(I32,F0),CAST(I32,F1), &r,&g,&b,a);
+            clut_4_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
             // 'a' was really a CMYK K, so our output is actually opaque.
             a = F1;
         } break;
@@ -874,28 +884,28 @@ static void exec_ops(const Op* ops, const void** args,
         // Notice, from here on down the store_ ops all return, ending the loop.
 
         case Op_store_a8: {
-            U8 alpha = CAST(U8, to_fixed(a * 255));
+            U8 alpha = cast<U8>(to_fixed(a * 255));
             small_memcpy(dst + i, &alpha, N);
         } return;
 
         case Op_store_g8: {
             // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
-            U8 gray = CAST(U8, to_fixed(g * 255));
+            U8 gray = cast<U8>(to_fixed(g * 255));
             small_memcpy(dst + i, &gray, N);
         } return;
 
         case Op_store_4444: {
-            U16 abgr = CAST(U16, to_fixed(r * 15) << 12)
-                     | CAST(U16, to_fixed(g * 15) <<  8)
-                     | CAST(U16, to_fixed(b * 15) <<  4)
-                     | CAST(U16, to_fixed(a * 15) <<  0);
+            U16 abgr = cast<U16>(to_fixed(r * 15) << 12)
+                     | cast<U16>(to_fixed(g * 15) <<  8)
+                     | cast<U16>(to_fixed(b * 15) <<  4)
+                     | cast<U16>(to_fixed(a * 15) <<  0);
             small_memcpy(dst + 2*i, &abgr, 2*N);
         } return;
 
         case Op_store_565: {
-            U16 rgb = CAST(U16, to_fixed(r * 31) <<  0 )
-                    | CAST(U16, to_fixed(g * 63) <<  5 )
-                    | CAST(U16, to_fixed(b * 31) << 11 );
+            U16 rgb = cast<U16>(to_fixed(r * 31) <<  0 )
+                    | cast<U16>(to_fixed(g * 63) <<  5 )
+                    | cast<U16>(to_fixed(b * 31) << 11 );
             small_memcpy(dst + 2*i, &rgb, 2*N);
         } return;
 
@@ -905,9 +915,9 @@ static void exec_ops(const Op* ops, const void** args,
             // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
             // get there via U16 to save some instructions converting to float. And just
             // like load_888, we'd prefer to go via U32 but for ARMv7 support.
-            U16 R = CAST(U16, to_fixed(r * 255)),
-                G = CAST(U16, to_fixed(g * 255)),
-                B = CAST(U16, to_fixed(b * 255));
+            U16 R = cast<U16>(to_fixed(r * 255)),
+                G = cast<U16>(to_fixed(g * 255)),
+                B = cast<U16>(to_fixed(b * 255));
 
             uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
             vst3_lane_u8(rgb+0, v, 0);
             vst3_lane_u8(rgb+3, v, 2);
             vst3_lane_u8(rgb+6, v, 4);
             vst3_lane_u8(rgb+9, v, 6);
 #else
-            STORE_3(rgb+0, CAST(U8, to_fixed(r * 255)) );
-            STORE_3(rgb+1, CAST(U8, to_fixed(g * 255)) );
-            STORE_3(rgb+2, CAST(U8, to_fixed(b * 255)) );
+            STORE_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
+            STORE_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
+            STORE_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
 #endif
         } return;
 
         case Op_store_8888: {
-            U32 rgba = CAST(U32, to_fixed(r * 255) <<  0)
-                     | CAST(U32, to_fixed(g * 255) <<  8)
-                     | CAST(U32, to_fixed(b * 255) << 16)
-                     | CAST(U32, to_fixed(a * 255) << 24);
+            U32 rgba = cast<U32>(to_fixed(r * 255) <<  0)
+                     | cast<U32>(to_fixed(g * 255) <<  8)
+                     | cast<U32>(to_fixed(b * 255) << 16)
+                     | cast<U32>(to_fixed(a * 255) << 24);
             small_memcpy(dst + 4*i, &rgba, 4*N);
         } return;
 
         case Op_store_1010102: {
-            U32 rgba = CAST(U32, to_fixed(r * 1023) <<  0)
-                     | CAST(U32, to_fixed(g * 1023) << 10)
-                     | CAST(U32, to_fixed(b * 1023) << 20)
-                     | CAST(U32, to_fixed(a *    3) << 30);
+            U32 rgba = cast<U32>(to_fixed(r * 1023) <<  0)
+                     | cast<U32>(to_fixed(g * 1023) << 10)
+                     | cast<U32>(to_fixed(b * 1023) << 20)
+                     | cast<U32>(to_fixed(a *    3) << 30);
             small_memcpy(dst + 4*i, &rgba, 4*N);
         } return;
 
@@ -943,18 +953,18 @@ static void exec_ops(const Op* ops, const void** args,
             uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
 #if defined(USING_NEON)
             uint16x4x3_t v = {{
-                (uint16x4_t)swap_endian_16(CAST(U16, to_fixed(r * 65535))),
-                (uint16x4_t)swap_endian_16(CAST(U16, to_fixed(g * 65535))),
-                (uint16x4_t)swap_endian_16(CAST(U16, to_fixed(b * 65535))),
+                (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(r * 65535))),
+                (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(g * 65535))),
+                (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(b * 65535))),
             }};
             vst3_u16(rgb, v);
 #else
             I32 R = to_fixed(r * 65535),
                 G = to_fixed(g * 65535),
                 B = to_fixed(b * 65535);
-            STORE_3(rgb+0, CAST(U16, (R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
-            STORE_3(rgb+1, CAST(U16, (G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
-            STORE_3(rgb+2, CAST(U16, (B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
+            STORE_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
+            STORE_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
+            STORE_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
 #endif
         } return;
 
@@ -965,18 +975,18 @@ static void exec_ops(const Op* ops, const void** args,
             uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
 #if defined(USING_NEON)
             uint16x4x4_t v = {{
-                (uint16x4_t)swap_endian_16(CAST(U16, to_fixed(r * 65535))),
-                (uint16x4_t)swap_endian_16(CAST(U16, to_fixed(g * 65535))),
-                (uint16x4_t)swap_endian_16(CAST(U16, to_fixed(b * 65535))),
-                (uint16x4_t)swap_endian_16(CAST(U16, to_fixed(a * 65535))),
+                (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(r * 65535))),
+                (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(g * 65535))),
+                (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(b * 65535))),
+                (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(a * 65535))),
             }};
             vst4_u16(rgba, v);
 #else
-            U64 px = CAST(U64, to_fixed(r * 65535)) <<  0
-                   | CAST(U64, to_fixed(g * 65535)) << 16
-                   | CAST(U64, to_fixed(b * 65535)) << 32
-                   | CAST(U64, to_fixed(a * 65535)) << 48;
-            swap_endian_16x4(&px);
+            U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
+                   | cast<U64>(to_fixed(g * 65535)) << 16
+                   | cast<U64>(to_fixed(b * 65535)) << 32
+                   | cast<U64>(to_fixed(a * 65535)) << 48;
+            px = swap_endian_16x4(px);
             small_memcpy(rgba, &px, 8*N);
 #endif
         } return;
 
@@ -1021,10 +1031,10 @@ static void exec_ops(const Op* ops, const void** args,
             }};
             vst4_u16(rgba, v);
 #else
-            U64 px = CAST(U64, R) <<  0
-                   | CAST(U64, G) << 16
-                   | CAST(U64, B) << 32
-                   | CAST(U64, A) << 48;
+            U64 px = cast<U64>(R) <<  0
+                   | cast<U64>(G) << 16
+                   | cast<U64>(B) << 32
+                   | cast<U64>(A) << 48;
 
             small_memcpy(rgba, &px, 8*N);
 #endif
@@ -1091,10 +1101,6 @@ static void run_program(const Op* program, const void** arguments,
     }
 }
 
-#if defined(USING_NEON) && defined(__clang__)
-    #pragma clang diagnostic pop
-#endif
-
 // Clean up any #defines we may have set so that we can be #included again.
 
 #if defined(USING_NEON)
@@ -1109,8 +1115,6 @@ static void run_program(const Op* program, const void** arguments,
     #undef USING_AVX_F16C
 #endif
 
-#undef CAST
-
 #if defined(LOAD_3)
     #undef LOAD_3
 #endif
diff --git a/third_party/skcms/version.sha1 b/third_party/skcms/version.sha1
index 606390d341..9e2b83f11a 100755
--- a/third_party/skcms/version.sha1
+++ b/third_party/skcms/version.sha1
@@ -1 +1 @@
-779b8c7799f39cbc833612a5ffbf877aedabe6f5
\ No newline at end of file
+c663954a7567fdb9b26222c11916fe8ab714f2b8
\ No newline at end of file
-- 
cgit v1.2.3
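
The heart of the rolled skcms change is the switch from the type-generic CAST(T, v)
macro to a cast<T>(v) function template. A minimal standalone sketch of the same
pattern, assuming a compiler with __builtin_convertvector (Clang, or GCC 9+); the
4-lane F and I32 typedefs and the demo itself are stand-ins for skcms's real N-lane
types, not skcms code:

    // demo_cast.cc -- hypothetical sketch of the cast<>() pattern from the patch.
    #include <cstdint>

    typedef float   F   __attribute__((vector_size(16)));  // 4 x float
    typedef int32_t I32 __attribute__((vector_size(16)));  // 4 x int32_t

    template <typename D, typename S>
    static inline D cast(const S& v) {
        // Lane-by-lane value conversion.  A plain (D)v between same-sized vector
        // types only reinterprets the bits, which is exactly why cast<>() exists.
        return __builtin_convertvector(v, D);
    }

    static inline I32 to_fixed(F f) {
        // Adding 0.5f before the truncating conversion rounds the non-negative
        // values this path sees to the nearest integer.
        return cast<I32>(f + 0.5f);
    }

    int main() {
        F rgba = {0.25f, 0.5f, 0.75f, 1.0f};
        I32 fixed = to_fixed(rgba * 255.0f);  // {64, 128, 191, 255}
        return fixed[3] == 255 ? 0 : 1;
    }

The non-Clang branches in the patch spell out D{v[0],v[1],...} by hand because, at
the time of this roll, GCC had no __builtin_convertvector.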
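The swap_endian_16x4() rewrite is what motivates the new -Wpsabi pragma: the helper
now returns a U64 by value, and per the patch's own comment GCC warns about returning
a vector that wide on x86 ("using AVX even though AVX is not enabled"); since the
helper is always inlined, the warning is noise. The mask-and-shift itself is easiest
to see on a plain scalar uint64_t holding four big-endian 16-bit channels -- a sketch,
not the vectorized original:

    #include <cstdint>
    #include <cstdio>

    // Byte-swap each of the four 16-bit lanes packed into one 64-bit value.
    static inline uint64_t swap_endian_16x4(uint64_t rgba) {
        return (rgba & 0x00ff00ff00ff00ffull) << 8    // low bytes move up...
             | (rgba & 0xff00ff00ff00ff00ull) >> 8;   // ...high bytes move down.
    }

    int main() {
        // Each 16-bit lane has its bytes swapped: 0x1234 -> 0x3412, and so on.
        std::printf("%016llx\n",
                    (unsigned long long)swap_endian_16x4(0x123456789abcdef0ull));
        // Prints 34127856bc9af0de.
        return 0;
    }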
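approx_log2() is only re-spelled by this patch (CAST to cast<F>), but the trick it
builds on is worth seeing in scalar form: a positive float's bit pattern, read as an
integer, is approximately (log2(x) + 127) * 2^23, and the mantissa term the skcms
comment mentions only refines this first-order estimate. A hypothetical scalar sketch
(rough_log2 is not a skcms function, and it skips the mantissa refinement entirely):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Exponent-only estimate of log2(x); exact for powers of two.
    static float rough_log2(float x) {
        int32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));  // same bit-pun as the patch
        return (float)bits * (1.0f / (1 << 23)) - 127.0f;
    }

    int main() {
        std::printf("%g %g\n", rough_log2(8.0f), rough_log2(1024.0f));  // 3 10
        return 0;
    }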