/*
* Copyright 2014 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkTaskGroup_DEFINED
#define SkTaskGroup_DEFINED
#include "SkTypes.h"
#include "SkAtomics.h"
#include "SkTemplates.h"
struct SkRunnable;
// A SkTaskGroup queues small tasks onto a shared pool of worker threads and
// lets its owner block (via wait(), also called from the destructor) until
// every task it has add()ed completes.
class SkTaskGroup : SkNoncopyable {
public:
    // Create one of these in main() to enable SkTaskGroups globally.
    struct Enabler : SkNoncopyable {
        explicit Enabler(int threads = -1); // Default is system-reported core count.
        ~Enabler();
    };

    SkTaskGroup();
    ~SkTaskGroup() { this->wait(); } // Never outlives its pending tasks.

    // Add a task to this SkTaskGroup. It will likely run on another thread.
    // Neither add() method takes ownership of any of its parameters.
    void add(SkRunnable*);

    // Typed convenience overload: forwards to the type-erased add() below.
    template <typename T>
    void add(void (*fn)(T*), T* arg) { this->add((void_fn)fn, (void*)arg); }

    // Add a batch of N tasks, all calling fn with different arguments.
    // Equivalent to a loop over add(fn, arg), but with perhaps less synchronization overhead.
    template <typename T>
    void batch(void (*fn)(T*), T* args, int N) { this->batch((void_fn)fn, args, N, sizeof(T)); }

    // Block until all Tasks previously add()ed to this SkTaskGroup have run.
    // You may safely reuse this SkTaskGroup after wait() returns.
    void wait();

private:
    typedef void(*void_fn)(void*);

    // Type-erased implementations, defined in the .cpp.
    void add (void_fn, void* arg);
    void batch(void_fn, void* args, int N, size_t stride);

    // NOTE(review): presumably the count of tasks added but not yet finished —
    // confirm against the .cpp implementation.
    SkAtomic<int32_t> fPending;
};
// Returns the best estimate of the number of CPU cores available to use.
int sk_num_cores();
// Call f(i) for i in [0, end).
template <typename Func>
void sk_parallel_for(int end, const Func& f) {
if (end <= 0) { return; }
struct Chunk {
const Func* f;
int start, end;
};
// TODO(mtklein): this chunking strategy could probably use some tuning.
int max_chunks = sk_num_cores() * 2,
stride = (end + max_chunks - 1 ) / max_chunks,
nchunks = (end + stride - 1 ) / stride;
SkASSERT(nchunks <= max_chunks);
// With the chunking strategy above this won't malloc until we have a machine with >512 cores.
SkAutoSTMalloc<1024, Chunk> chunks(nchunks);
for (int i = 0; i < nchunks; i++) {
Chunk& c = chunks[i];
c.f = &f;
c.start = i * stride;
c.end = SkTMin(c.start + stride, end);
SkASSERT(c.start < c.end); // Nothing will break if start >= end, but it's a wasted chunk.
}
void(*run_chunk)(Chunk*) = [](Chunk* c) {
for (int i = c->start; i < c->end; i++) {
(*c->f)(i);
}
};
SkTaskGroup().batch(run_chunk, chunks.get(), nchunks);
}
#endif//SkTaskGroup_DEFINED