aboutsummaryrefslogtreecommitdiffhomepage
path: root/third_party/skcms/src/Transform.c
blob: f07c5e7ba405ee4befee5e1c1b85fcbe30ec3a99 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "../skcms.h"
#include "Curve.h"
#include "LinearAlgebra.h"
#include "Macros.h"
#include "PortableMath.h"
#include "TransferFunction.h"
#include "Transform.h"
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>

// Without this wasm would try to use the N=4 128-bit vector code path,
// which while ideal, causes tons of compiler problems.  This would be
// a good thing to revisit as emcc matures (currently 1.38.5).
#if 1 && defined(__EMSCRIPTEN_major__)
    #if !defined(SKCMS_PORTABLE)
        #define  SKCMS_PORTABLE
    #endif
#endif

extern bool g_skcms_dump_profile;
bool g_skcms_dump_profile = false;

#if !defined(NDEBUG) && defined(__clang__)
    // Basic profiling tools to time each Op.  Not at all thread safe.

    #include <stdio.h>
    #include <stdlib.h>

    #if defined(__arm__) || defined(__aarch64__)
        #include <time.h>
        static const char* now_units = "ticks";
        static uint64_t now() { return (uint64_t)clock(); }
    #else
        static const char* now_units = "cycles";
        static uint64_t now() { return __builtin_readcyclecounter(); }
    #endif

    #define M(op) +1
    static uint64_t counts[FOREACH_Op(M)];
    #undef M

    static void profile_dump_stats() {
    #define M(op) #op,
        static const char* names[] = { FOREACH_Op(M) };
    #undef M
        for (int i = 0; i < ARRAY_COUNT(counts); i++) {
            if (counts[i]) {
                fprintf(stderr, "%16s: %12llu %s\n",
                        names[i], (unsigned long long)counts[i], now_units);
            }
        }
    }

    static inline Op profile_next_op(Op op) {
        if (__builtin_expect(g_skcms_dump_profile, false)) {
            static uint64_t start    = 0;
            static uint64_t* current = NULL;

            if (!current) {
                atexit(profile_dump_stats);
            } else {
                *current += now() - start;
            }

            current = &counts[op];
            start   = now();
        }
        return op;
    }
#else
    static inline Op profile_next_op(Op op) {
        (void)g_skcms_dump_profile;
        return op;
    }
#endif

#if defined(__clang__)
    typedef float    __attribute__((ext_vector_type(4)))   Fx4;
    typedef int32_t  __attribute__((ext_vector_type(4))) I32x4;
    typedef uint64_t __attribute__((ext_vector_type(4))) U64x4;
    typedef uint32_t __attribute__((ext_vector_type(4))) U32x4;
    typedef uint16_t __attribute__((ext_vector_type(4))) U16x4;
    typedef uint8_t  __attribute__((ext_vector_type(4)))  U8x4;

    typedef float    __attribute__((ext_vector_type(8)))   Fx8;
    typedef int32_t  __attribute__((ext_vector_type(8))) I32x8;
    typedef uint64_t __attribute__((ext_vector_type(8))) U64x8;
    typedef uint32_t __attribute__((ext_vector_type(8))) U32x8;
    typedef uint16_t __attribute__((ext_vector_type(8))) U16x8;
    typedef uint8_t  __attribute__((ext_vector_type(8)))  U8x8;

    typedef float    __attribute__((ext_vector_type(16)))   Fx16;
    typedef int32_t  __attribute__((ext_vector_type(16))) I32x16;
    typedef uint64_t __attribute__((ext_vector_type(16))) U64x16;
    typedef uint32_t __attribute__((ext_vector_type(16))) U32x16;
    typedef uint16_t __attribute__((ext_vector_type(16))) U16x16;
    typedef uint8_t  __attribute__((ext_vector_type(16)))  U8x16;
#elif defined(__GNUC__)
    typedef float    __attribute__((vector_size(16)))   Fx4;
    typedef int32_t  __attribute__((vector_size(16))) I32x4;
    typedef uint64_t __attribute__((vector_size(32))) U64x4;
    typedef uint32_t __attribute__((vector_size(16))) U32x4;
    typedef uint16_t __attribute__((vector_size( 8))) U16x4;
    typedef uint8_t  __attribute__((vector_size( 4)))  U8x4;

    typedef float    __attribute__((vector_size(32)))   Fx8;
    typedef int32_t  __attribute__((vector_size(32))) I32x8;
    typedef uint64_t __attribute__((vector_size(64))) U64x8;
    typedef uint32_t __attribute__((vector_size(32))) U32x8;
    typedef uint16_t __attribute__((vector_size(16))) U16x8;
    typedef uint8_t  __attribute__((vector_size( 8)))  U8x8;

    typedef float    __attribute__((vector_size( 64)))   Fx16;
    typedef int32_t  __attribute__((vector_size( 64))) I32x16;
    typedef uint64_t __attribute__((vector_size(128))) U64x16;
    typedef uint32_t __attribute__((vector_size( 64))) U32x16;
    typedef uint16_t __attribute__((vector_size( 32))) U16x16;
    typedef uint8_t  __attribute__((vector_size( 16)))  U8x16;
#endif

// First, instantiate our default exec_ops() implementation using the default compiliation target.

#if defined(SKCMS_PORTABLE) || !(defined(__clang__) || defined(__GNUC__))
    #define N 1

    #define F   float
    #define U64 uint64_t
    #define U32 uint32_t
    #define I32 int32_t
    #define U16 uint16_t
    #define U8  uint8_t

    #define F0 0.0f
    #define F1 1.0f

#elif defined(__AVX512F__)
    #define N 16

    #define F     Fx16
    #define U64 U64x16
    #define U32 U32x16
    #define I32 I32x16
    #define U16 U16x16
    #define U8   U8x16

    #define F0 (F){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}
    #define F1 (F){1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1}
#elif defined(__AVX__)
    #define N 8

    #define F     Fx8
    #define U64 U64x8
    #define U32 U32x8
    #define I32 I32x8
    #define U16 U16x8
    #define U8   U8x8

    #define F0 (F){0,0,0,0, 0,0,0,0}
    #define F1 (F){1,1,1,1, 1,1,1,1}
#else
    #define N 4

    #define F     Fx4
    #define U64 U64x4
    #define U32 U32x4
    #define I32 I32x4
    #define U16 U16x4
    #define U8   U8x4

    #define F0 (F){0,0,0,0}
    #define F1 (F){1,1,1,1}
#endif

#define NS(id) id
#define ATTR
    #include "Transform_inl.h"
#undef N
#undef F
#undef U64
#undef U32
#undef I32
#undef U16
#undef U8
#undef F0
#undef F1
#undef NS
#undef ATTR

// Now, instantiate any other versions of run_program() we may want for runtime detection.
#if !defined(SKCMS_PORTABLE) && (defined(__clang__) || defined(__GNUC__)) \
        && defined(__x86_64__) && !defined(__AVX2__)
    #define N 8
    #define F     Fx8
    #define U64 U64x8
    #define U32 U32x8
    #define I32 I32x8
    #define U16 U16x8
    #define U8   U8x8
    #define F0 (F){0,0,0,0, 0,0,0,0}
    #define F1 (F){1,1,1,1, 1,1,1,1}

    #define NS(id) id ## _hsw
    #define ATTR __attribute__((target("avx2,f16c")))

    // We check these guards to see if we have support for these features.
    // They're likely _not_ defined here in our baseline build config.
    #ifndef __AVX__
        #define __AVX__ 1
        #define UNDEF_AVX
    #endif
    #ifndef __F16C__
        #define __F16C__ 1
        #define UNDEF_F16C
    #endif
    #ifndef __AVX2__
        #define __AVX2__ 1
        #define UNDEF_AVX2
    #endif

    #include "Transform_inl.h"

    #undef N
    #undef F
    #undef U64
    #undef U32
    #undef I32
    #undef U16
    #undef U8
    #undef F0
    #undef F1
    #undef NS
    #undef ATTR

    #ifdef UNDEF_AVX
        #undef __AVX__
        #undef UNDEF_AVX
    #endif
    #ifdef UNDEF_F16C
        #undef __F16C__
        #undef UNDEF_F16C
    #endif
    #ifdef UNDEF_AVX2
        #undef __AVX2__
        #undef UNDEF_AVX2
    #endif

    #define TEST_FOR_HSW

    static bool hsw_ok_ = false;
    static void check_hsw_ok() {
        // See http://www.sandpile.org/x86/cpuid.htm

        // First, a basic cpuid(1).
        uint32_t eax, ebx, ecx, edx;
        __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                                     : "0"(1), "2"(0));

        // Sanity check for prerequisites.
        if ((edx & (1<<25)) != (1<<25)) { return; }   // SSE
        if ((edx & (1<<26)) != (1<<26)) { return; }   // SSE2
        if ((ecx & (1<< 0)) != (1<< 0)) { return; }   // SSE3
        if ((ecx & (1<< 9)) != (1<< 9)) { return; }   // SSSE3
        if ((ecx & (1<<19)) != (1<<19)) { return; }   // SSE4.1
        if ((ecx & (1<<20)) != (1<<20)) { return; }   // SSE4.2

        if ((ecx & (3<<26)) != (3<<26)) { return; }   // XSAVE + OSXSAVE

        {
            uint32_t eax_xgetbv, edx_xgetbv;
            __asm__ __volatile__("xgetbv" : "=a"(eax_xgetbv), "=d"(edx_xgetbv) : "c"(0));
            if ((eax_xgetbv & (3<<1)) != (3<<1)) { return; }  // XMM+YMM state saved?
        }

        if ((ecx & (1<<28)) != (1<<28)) { return; }   // AVX
        if ((ecx & (1<<29)) != (1<<29)) { return; }   // F16C
        if ((ecx & (1<<12)) != (1<<12)) { return; }   // FMA  (TODO: not currently used)

        // Call cpuid(7) to check for our final AVX2 feature bit!
        __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                                     : "0"(7), "2"(0));
        if ((ebx & (1<< 5)) != (1<< 5)) { return; }   // AVX2

        hsw_ok_ = true;
    }

    #if defined(_MSC_VER)
        #include <Windows.h>
        INIT_ONCE check_hsw_ok_once = INIT_ONCE_STATIC_INIT;

        static BOOL check_hsw_ok_InitOnce_wrapper(INIT_ONCE* once, void* param, void** ctx) {
            (void)once;
            (void)param;
            (void)ctx;
            check_hsw_ok();
            return TRUE;
        }

        static bool hsw_ok() {
            InitOnceExecuteOnce(&check_hsw_ok_once, check_hsw_ok_InitOnce_wrapper, NULL, NULL);
            return hsw_ok_;
        }
    #else
        #include <pthread.h>
        static pthread_once_t check_hsw_ok_once = PTHREAD_ONCE_INIT;

        static bool hsw_ok() {
            pthread_once(&check_hsw_ok_once, check_hsw_ok);
            return hsw_ok_;
        }
    #endif

#endif

static bool is_identity_tf(const skcms_TransferFunction* tf) {
    return tf->g == 1 && tf->a == 1
        && tf->b == 0 && tf->c == 0 && tf->d == 0 && tf->e == 0 && tf->f == 0;
}

typedef struct {
    Op          op;
    const void* arg;
} OpAndArg;

static OpAndArg select_curve_op(const skcms_Curve* curve, int channel) {
    static const struct { Op parametric, table_8, table_16; } ops[] = {
        { Op_tf_r, Op_table_8_r, Op_table_16_r },
        { Op_tf_g, Op_table_8_g, Op_table_16_g },
        { Op_tf_b, Op_table_8_b, Op_table_16_b },
        { Op_tf_a, Op_table_8_a, Op_table_16_a },
    };

    if (curve->table_entries == 0) {
        return is_identity_tf(&curve->parametric)
            ? (OpAndArg){ Op_noop, NULL }
            : (OpAndArg){ ops[channel].parametric, &curve->parametric };
    } else if (curve->table_8) {
        return (OpAndArg){ ops[channel].table_8,  curve };
    } else if (curve->table_16) {
        return (OpAndArg){ ops[channel].table_16, curve };
    }

    assert(false);
    return (OpAndArg){Op_noop,NULL};
}

static size_t bytes_per_pixel(skcms_PixelFormat fmt) {
    switch (fmt >> 1) {   // ignore rgb/bgr
        case skcms_PixelFormat_RGB_565       >> 1: return  2;
        case skcms_PixelFormat_RGB_888       >> 1: return  3;
        case skcms_PixelFormat_RGBA_8888     >> 1: return  4;
        case skcms_PixelFormat_RGBA_1010102  >> 1: return  4;
        case skcms_PixelFormat_RGB_161616    >> 1: return  6;
        case skcms_PixelFormat_RGBA_16161616 >> 1: return  8;
        case skcms_PixelFormat_RGB_hhh       >> 1: return  6;
        case skcms_PixelFormat_RGBA_hhhh     >> 1: return  8;
        case skcms_PixelFormat_RGB_fff       >> 1: return 12;
        case skcms_PixelFormat_RGBA_ffff     >> 1: return 16;
    }
    assert(false);
    return 0;
}

static bool prep_for_destination(const skcms_ICCProfile* profile,
                                 skcms_Matrix3x3* fromXYZD50,
                                 skcms_TransferFunction* invR,
                                 skcms_TransferFunction* invG,
                                 skcms_TransferFunction* invB) {
    // We only support destinations with parametric transfer functions
    // and with gamuts that can be transformed from XYZD50.
    return profile->has_trc
        && profile->has_toXYZD50
        && profile->trc[0].table_entries == 0
        && profile->trc[1].table_entries == 0
        && profile->trc[2].table_entries == 0
        && skcms_TransferFunction_invert(&profile->trc[0].parametric, invR)
        && skcms_TransferFunction_invert(&profile->trc[1].parametric, invG)
        && skcms_TransferFunction_invert(&profile->trc[2].parametric, invB)
        && skcms_Matrix3x3_invert(&profile->toXYZD50, fromXYZD50);
}

bool skcms_Transform(const void*             src,
                     skcms_PixelFormat       srcFmt,
                     skcms_AlphaFormat       srcAlpha,
                     const skcms_ICCProfile* srcProfile,
                     void*                   dst,
                     skcms_PixelFormat       dstFmt,
                     skcms_AlphaFormat       dstAlpha,
                     const skcms_ICCProfile* dstProfile,
                     size_t                  nz) {
    const size_t dst_bpp = bytes_per_pixel(dstFmt),
                 src_bpp = bytes_per_pixel(srcFmt);
    // Let's just refuse if the request is absurdly big.
    if (nz * dst_bpp > INT_MAX || nz * src_bpp > INT_MAX) {
        return false;
    }
    int n = (int)nz;

    // Null profiles default to sRGB. Passing null for both is handy when doing format conversion.
    if (!srcProfile) {
        srcProfile = skcms_sRGB_profile();
    }
    if (!dstProfile) {
        dstProfile = skcms_sRGB_profile();
    }

    // We can't transform in place unless the PixelFormats are the same size.
    if (dst == src && (dstFmt >> 1) != (srcFmt >> 1)) {
        return false;
    }
    // TODO: this check lazilly disallows U16 <-> F16, but that would actually be fine.
    // TODO: more careful alias rejection (like, dst == src + 1)?

    Op          program  [32];
    const void* arguments[32];

    Op*          ops  = program;
    const void** args = arguments;

    skcms_TransferFunction inv_dst_tf_r, inv_dst_tf_g, inv_dst_tf_b;
    skcms_Matrix3x3        from_xyz;

    switch (srcFmt >> 1) {
        default: return false;
        case skcms_PixelFormat_RGB_565       >> 1: *ops++ = Op_load_565;      break;
        case skcms_PixelFormat_RGB_888       >> 1: *ops++ = Op_load_888;      break;
        case skcms_PixelFormat_RGBA_8888     >> 1: *ops++ = Op_load_8888;     break;
        case skcms_PixelFormat_RGBA_1010102  >> 1: *ops++ = Op_load_1010102;  break;
        case skcms_PixelFormat_RGB_161616    >> 1: *ops++ = Op_load_161616;   break;
        case skcms_PixelFormat_RGBA_16161616 >> 1: *ops++ = Op_load_16161616; break;
        case skcms_PixelFormat_RGB_hhh       >> 1: *ops++ = Op_load_hhh;      break;
        case skcms_PixelFormat_RGBA_hhhh     >> 1: *ops++ = Op_load_hhhh;     break;
        case skcms_PixelFormat_RGB_fff       >> 1: *ops++ = Op_load_fff;      break;
        case skcms_PixelFormat_RGBA_ffff     >> 1: *ops++ = Op_load_ffff;     break;
    }
    if (srcFmt & 1) {
        *ops++ = Op_swap_rb;
    }

    if (srcProfile->data_color_space == skcms_Signature_CMYK) {
        // Photoshop creates CMYK images as inverse CMYK.
        // These happen to be the only ones we've _ever_ seen.
        *ops++ = Op_invert;
        // With CMYK, ignore the alpha type, to avoid changing K or conflating CMY with K.
        srcAlpha = skcms_AlphaFormat_Unpremul;
    }

    if (srcAlpha == skcms_AlphaFormat_Opaque) {
        *ops++ = Op_force_opaque;
    } else if (srcAlpha == skcms_AlphaFormat_PremulAsEncoded) {
        *ops++ = Op_unpremul;
    }

    // TODO: We can skip this work if both srcAlpha and dstAlpha are PremulLinear, and the profiles
    // are the same. Also, if dstAlpha is PremulLinear, and SrcAlpha is Opaque.
    if (dstProfile != srcProfile ||
        srcAlpha == skcms_AlphaFormat_PremulLinear ||
        dstAlpha == skcms_AlphaFormat_PremulLinear) {

        if (!prep_for_destination(dstProfile,
                                  &from_xyz, &inv_dst_tf_r, &inv_dst_tf_b, &inv_dst_tf_g)) {
            return false;
        }

        if (srcProfile->has_A2B) {
            if (srcProfile->A2B.input_channels) {
                for (int i = 0; i < (int)srcProfile->A2B.input_channels; i++) {
                    OpAndArg oa = select_curve_op(&srcProfile->A2B.input_curves[i], i);
                    if (oa.op != Op_noop) {
                        *ops++  = oa.op;
                        *args++ = oa.arg;
                    }
                }
                switch (srcProfile->A2B.input_channels) {
                    case 3: *ops++ = srcProfile->A2B.grid_8 ? Op_clut_3D_8 : Op_clut_3D_16; break;
                    case 4: *ops++ = srcProfile->A2B.grid_8 ? Op_clut_4D_8 : Op_clut_4D_16; break;
                    default: return false;
                }
                *args++ = &srcProfile->A2B;
            }

            if (srcProfile->A2B.matrix_channels == 3) {
                for (int i = 0; i < 3; i++) {
                    OpAndArg oa = select_curve_op(&srcProfile->A2B.matrix_curves[i], i);
                    if (oa.op != Op_noop) {
                        *ops++  = oa.op;
                        *args++ = oa.arg;
                    }
                }

                static const skcms_Matrix3x4 I = {{
                    {1,0,0,0},
                    {0,1,0,0},
                    {0,0,1,0},
                }};
                if (0 != memcmp(&I, &srcProfile->A2B.matrix, sizeof(I))) {
                    *ops++  = Op_matrix_3x4;
                    *args++ = &srcProfile->A2B.matrix;
                }
            }

            if (srcProfile->A2B.output_channels == 3) {
                for (int i = 0; i < 3; i++) {
                    OpAndArg oa = select_curve_op(&srcProfile->A2B.output_curves[i], i);
                    if (oa.op != Op_noop) {
                        *ops++  = oa.op;
                        *args++ = oa.arg;
                    }
                }
            }

            if (srcProfile->pcs == skcms_Signature_Lab) {
                *ops++ = Op_lab_to_xyz;
            }

        } else if (srcProfile->has_trc && srcProfile->has_toXYZD50) {
            for (int i = 0; i < 3; i++) {
                OpAndArg oa = select_curve_op(&srcProfile->trc[i], i);
                if (oa.op != Op_noop) {
                    *ops++  = oa.op;
                    *args++ = oa.arg;
                }
            }
        } else {
            return false;
        }

        // At this point our source colors are linear, either RGB (XYZ-type profiles)
        // or XYZ (A2B-type profiles). Unpremul is a linear operation (multiply by a
        // constant 1/a), so either way we can do it now if needed.
        if (srcAlpha == skcms_AlphaFormat_PremulLinear) {
            *ops++ = Op_unpremul;
        }

        // A2B sources should already be in XYZD50 at this point.
        // Others still need to be transformed using their toXYZD50 matrix.
        // N.B. There are profiles that contain both A2B tags and toXYZD50 matrices.
        // If we use the A2B tags, we need to ignore the XYZD50 matrix entirely.
        assert (srcProfile->has_A2B || srcProfile->has_toXYZD50);
        static const skcms_Matrix3x3 I = {{
            { 1.0f, 0.0f, 0.0f },
            { 0.0f, 1.0f, 0.0f },
            { 0.0f, 0.0f, 1.0f },
        }};
        const skcms_Matrix3x3* to_xyz = srcProfile->has_A2B ? &I : &srcProfile->toXYZD50;

        // There's a chance the source and destination gamuts are identical,
        // in which case we can skip the gamut transform.
        if (0 != memcmp(&dstProfile->toXYZD50, to_xyz, sizeof(skcms_Matrix3x3))) {
            // Concat the entire gamut transform into from_xyz,
            // now slightly misnamed but it's a handy spot to stash the result.
            from_xyz = skcms_Matrix3x3_concat(&from_xyz, to_xyz);
            *ops++  = Op_matrix_3x3;
            *args++ = &from_xyz;
        }

        if (dstAlpha == skcms_AlphaFormat_PremulLinear) {
            *ops++ = Op_premul;
        }

        // Encode back to dst RGB using its parametric transfer functions.
        if (!is_identity_tf(&inv_dst_tf_r)) { *ops++ = Op_tf_r; *args++ = &inv_dst_tf_r; }
        if (!is_identity_tf(&inv_dst_tf_g)) { *ops++ = Op_tf_g; *args++ = &inv_dst_tf_g; }
        if (!is_identity_tf(&inv_dst_tf_b)) { *ops++ = Op_tf_b; *args++ = &inv_dst_tf_b; }
    }

    if (dstAlpha == skcms_AlphaFormat_Opaque) {
        *ops++ = Op_force_opaque;
    } else if (dstAlpha == skcms_AlphaFormat_PremulAsEncoded) {
        *ops++ = Op_premul;
    }
    if (dstFmt & 1) {
        *ops++ = Op_swap_rb;
    }
    if (dstFmt < skcms_PixelFormat_RGB_hhh) {
        *ops++ = Op_clamp;
    }
    switch (dstFmt >> 1) {
        default: return false;
        case skcms_PixelFormat_RGB_565       >> 1: *ops++ = Op_store_565;      break;
        case skcms_PixelFormat_RGB_888       >> 1: *ops++ = Op_store_888;      break;
        case skcms_PixelFormat_RGBA_8888     >> 1: *ops++ = Op_store_8888;     break;
        case skcms_PixelFormat_RGBA_1010102  >> 1: *ops++ = Op_store_1010102;  break;
        case skcms_PixelFormat_RGB_161616    >> 1: *ops++ = Op_store_161616;   break;
        case skcms_PixelFormat_RGBA_16161616 >> 1: *ops++ = Op_store_16161616; break;
        case skcms_PixelFormat_RGB_hhh       >> 1: *ops++ = Op_store_hhh;      break;
        case skcms_PixelFormat_RGBA_hhhh     >> 1: *ops++ = Op_store_hhhh;     break;
        case skcms_PixelFormat_RGB_fff       >> 1: *ops++ = Op_store_fff;      break;
        case skcms_PixelFormat_RGBA_ffff     >> 1: *ops++ = Op_store_ffff;     break;
    }

    void (*run)(const Op*, const void**, const char*, char*, int, size_t,size_t) = run_program;
#if defined(TEST_FOR_HSW)
    if (hsw_ok()) {
        run = run_program_hsw;
    }
#endif
    run(program, arguments, src, dst, n, src_bpp,dst_bpp);
    return true;
}

static void assert_usable_as_destination(const skcms_ICCProfile* profile) {
#if defined(NDEBUG)
    (void)profile;
#else
    skcms_Matrix3x3 fromXYZD50;
    skcms_TransferFunction invR, invG, invB;
    assert(prep_for_destination(profile, &fromXYZD50, &invR, &invG, &invB));
#endif
}

bool skcms_MakeUsableAsDestination(skcms_ICCProfile* profile) {
    skcms_Matrix3x3 fromXYZD50;
    if (!profile->has_trc || !profile->has_toXYZD50
        || !skcms_Matrix3x3_invert(&profile->toXYZD50, &fromXYZD50)) {
        return false;
    }

    skcms_TransferFunction tf[3];
    for (int i = 0; i < 3; i++) {
        skcms_TransferFunction inv;
        if (profile->trc[i].table_entries == 0
            && skcms_TransferFunction_invert(&profile->trc[i].parametric, &inv)) {
            tf[i] = profile->trc[i].parametric;
            continue;
        }

        float max_error;
        // Parametric curves from skcms_ApproximateCurve() are guaranteed to be invertible.
        if (!skcms_ApproximateCurve(&profile->trc[i], &tf[i], &max_error)) {
            return false;
        }
    }

    for (int i = 0; i < 3; ++i) {
        profile->trc[i].table_entries = 0;
        profile->trc[i].parametric = tf[i];
    }

    assert_usable_as_destination(profile);
    return true;
}

bool skcms_MakeUsableAsDestinationWithSingleCurve(skcms_ICCProfile* profile) {
    // Operate on a copy of profile, so we can choose the best TF for the original curves
    skcms_ICCProfile result = *profile;
    if (!skcms_MakeUsableAsDestination(&result)) {
        return false;
    }

    int best_tf = 0;
    float min_max_error = INFINITY_;
    for (int i = 0; i < 3; i++) {
        skcms_TransferFunction inv;
        skcms_TransferFunction_invert(&result.trc[i].parametric, &inv);

        float err = 0;
        for (int j = 0; j < 3; ++j) {
            err = fmaxf_(err, skcms_MaxRoundtripError(&profile->trc[j], &inv));
        }
        if (min_max_error > err) {
            min_max_error = err;
            best_tf = i;
        }
    }

    for (int i = 0; i < 3; i++) {
        result.trc[i].parametric = result.trc[best_tf].parametric;
    }

    *profile = result;
    assert_usable_as_destination(profile);
    return true;
}