/*
 * Copyright 2014 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkTaskGroup_DEFINED
#define SkTaskGroup_DEFINED

#include "SkTypes.h"
#include "SkAtomics.h"
#include "SkTemplates.h"

struct SkRunnable;

class SkTaskGroup : SkNoncopyable {
public:
    // Create one of these in main() to enable SkTaskGroups globally.
    struct Enabler : SkNoncopyable {
        explicit Enabler(int threads = -1);  // Default is system-reported core count.
        ~Enabler();
    };
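
    // A minimal usage sketch (illustrative only; run_tasks() is a hypothetical
    // function that uses SkTaskGroups, not part of this header):
    //
    //     int main(int argc, char** argv) {
    //         SkTaskGroup::Enabler enabler;  // -1: one thread per reported core.
    //         run_tasks();                   // SkTaskGroups created here can run in parallel.
    //         return 0;
    //     }   // ~Enabler() tears the shared thread pool down.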

    SkTaskGroup();
    ~SkTaskGroup() { this->wait(); }

    // Add a task to this SkTaskGroup.  It will likely run on another thread.
    // Neither add() method takes ownership of any of its parameters.
    void add(SkRunnable*);

    template <typename T>
    void add(void (*fn)(T*), T* arg) { this->add((void_fn)fn, (void*)arg); }

    // Add a batch of N tasks, all calling fn with different arguments.
    // Equivalent to a loop over add(fn, arg), but with perhaps less synchronization overhead.
    template <typename T>
    void batch(void (*fn)(T*), T* args, int N) { this->batch((void_fn)fn, args, N, sizeof(T)); }

    // Block until all Tasks previously add()ed to this SkTaskGroup have run.
    // You may safely reuse this SkTaskGroup after wait() returns.
    void wait();
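
    // Putting batch() and wait() together, a minimal sketch (Item and
    // ProcessOne are hypothetical, not part of this header):
    //
    //     static void ProcessOne(Item* item) { /* ... */ }
    //
    //     void process_all(Item* items, int n) {
    //         SkTaskGroup tg;
    //         tg.batch(ProcessOne, items, n);  // Queues n calls, ProcessOne(&items[i]).
    //         tg.wait();                       // Blocks until all n calls have run.
    //     }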

private:
    typedef void(*void_fn)(void*);

    void add  (void_fn, void* arg);
    void batch(void_fn, void* args, int N, size_t stride);

    SkAtomic<int32_t> fPending;
};

// Returns the best estimate of the number of CPU cores available to use.
int sk_num_cores();

// Call f(i) for i in [0, end).
template <typename Func>
void sk_parallel_for(int end, const Func& f) {
    if (end <= 0) { return; }

    struct Chunk {
        const Func* f;
        int start, end;
    };

    // TODO(mtklein): this chunking strategy could probably use some tuning.
    int max_chunks  = sk_num_cores() * 2,
        stride      = (end + max_chunks - 1) / max_chunks,
        nchunks     = (end + stride - 1) / stride;
    SkASSERT(nchunks <= max_chunks);
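    // For example, on an 8-core machine with end == 100:
    //   max_chunks == 16, stride == (100 + 15) / 16 == 7, nchunks == (100 + 6) / 7 == 15,
    // so the first 14 chunks cover 7 indices each and the last covers the remaining 2.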

    // With the chunking strategy above this won't malloc until we have a machine with >512 cores.
    SkAutoSTMalloc<1024, Chunk> chunks(nchunks);

    for (int i = 0; i < nchunks; i++) {
        Chunk& c = chunks[i];
        c.f     = &f;
        c.start = i * stride;
        c.end   = SkTMin(c.start + stride, end);
        SkASSERT(c.start < c.end);  // Nothing will break if start >= end, but it's a wasted chunk.
    }

    void(*run_chunk)(Chunk*) = [](Chunk* c) {
        for (int i = c->start; i < c->end; i++) {
            (*c->f)(i);
        }
    };
    SkTaskGroup().batch(run_chunk, chunks.get(), nchunks);
}
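
// A usage sketch (illustrative; src, dst, and transform() are hypothetical):
//
//     sk_parallel_for(n, [&](int i) {
//         dst[i] = transform(src[i]);   // Each i in [0, n) is visited exactly once,
//     });                               // possibly on different threads.
//
// The call blocks until all n invocations complete: the temporary SkTaskGroup
// inside sk_parallel_for waits in its destructor.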

#endif  // SkTaskGroup_DEFINED