bench/benchmark-blocking-sizes.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356

#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <fstream>

int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
#include <Eigen/Core>

#include <bench/BenchTimer.h>

using namespace Eigen;
using namespace std;

static BenchTimer timer;

// how many times we repeat each measurement.
// measurements are randomly shuffled - we're not doing
// all N identical measurements in a row.
const int measurement_repetitions = 3;

// Timings below this value are too short to be accurate,
// we'll repeat measurements with more iterations until
// we get a timing above that threshold.
const float min_accurate_time = 1e-2f;

// See --min-working-set-size command line parameter.
size_t min_working_set_size = 0;

// range of sizes that we will benchmark (in all 3 K,M,N dimensions)
const size_t maxsize = 2048;
const size_t minsize = 16;

typedef MatrixXf MatrixType;

static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
static_assert(maxsize > minsize, "maxsize must be larger than minsize");
static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");

// just a helper to store a triple of K,M,N sizes for matrix product
struct size_triple_t
{
  size_t k, m, n;
  size_triple_t() : k(0), m(0), n(0) {}
  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
  size_triple_t(uint16_t compact)
  {
    k = 1 << ((compact & 0xf00) >> 8);
    m = 1 << ((compact & 0x0f0) >> 4);
    n = 1 << ((compact & 0x00f) >> 0);
  }
};

uint8_t log2_pot(size_t x) {
  size_t l = 0;
  while (x >>= 1) l++;
  return l;
}

// Convert between size tripes and a compact form fitting in 12 bits
// where each size, which must be a POT, is encoded as its log2, on 4 bits
// so the largest representable size is 2^15 == 32k  ... big enough.
uint16_t compact_size_triple(size_t k, size_t m, size_t n)
{
  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
}

uint16_t compact_size_triple(const size_triple_t& t)
{
  return compact_size_triple(t.k, t.m, t.n);
}

// A single benchmark. Initially only contains benchmark params.
// Then call run(), which stores the result in the gflops field.
struct benchmark_t
{
  uint16_t compact_product_size;
  uint16_t compact_block_size;
  float gflops;
  benchmark_t()
    : compact_product_size(0)
    , compact_block_size(0)
    , gflops(0)
  {}
  benchmark_t(size_t pk, size_t pm, size_t pn,
              size_t bk, size_t bm, size_t bn)
    : compact_product_size(compact_size_triple(pk, pm, pn))
    , compact_block_size(compact_size_triple(bk, bm, bn))
    , gflops(0)
  {}

  void run();
};

ostream& operator<<(ostream& s, const benchmark_t& b)
{
  s << hex;
  s << b.compact_product_size
    << " " << b.compact_block_size;
  s << dec;
  s << " " << b.gflops;
  return s;
}

// We sort first by increasing benchmark parameters,
// then by decreasing performance.
bool operator<(const benchmark_t& b1, const benchmark_t& b2)
{ 
  return b1.compact_product_size < b2.compact_product_size ||
           (b1.compact_product_size == b2.compact_product_size && (
             (b1.compact_block_size < b2.compact_block_size || (
               b1.compact_block_size == b2.compact_block_size &&
                 b1.gflops > b2.gflops))));
}

void benchmark_t::run()
{
  // expand our compact benchmark params into proper triples
  size_triple_t productsizes(compact_product_size);
  size_triple_t blocksizes(compact_block_size);
  
  // feed eigen with our custom blocking params
  eigen_block_size_k = blocksizes.k;
  eigen_block_size_m = blocksizes.m;
  eigen_block_size_n = blocksizes.n;

  // set up the matrix pool

  const size_t combined_three_matrices_sizes =
    sizeof(MatrixType::Scalar) *
      (productsizes.k * productsizes.m +
       productsizes.k * productsizes.n +
       productsizes.m * productsizes.n);

  // 64 M is large enough that nobody has a cache bigger than that,
  // while still being small enough that everybody has this much RAM,
  // so conveniently we don't need to special-case platforms here.
  const size_t unlikely_large_cache_size = 64 << 20;

  const size_t working_set_size =
    min_working_set_size ? min_working_set_size : unlikely_large_cache_size;

  const size_t matrix_pool_size =
    1 + working_set_size / combined_three_matrices_sizes;

  MatrixType *lhs = new MatrixType[matrix_pool_size];
  MatrixType *rhs = new MatrixType[matrix_pool_size];
  MatrixType *dst = new MatrixType[matrix_pool_size];
  
  for (size_t i = 0; i < matrix_pool_size; i++) {
    lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
    rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
    dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
  }

  // main benchmark loop

  int iters_at_a_time = 1;
  float time_per_iter = 0.0f;
  size_t matrix_index = 0;
  while (true) {

    double starttime = timer.getCpuTime();
    for (int i = 0; i < iters_at_a_time; i++) {
      dst[matrix_index] = lhs[matrix_index] * rhs[matrix_index];
      matrix_index++;
      if (matrix_index == matrix_pool_size) {
        matrix_index = 0;
      }
    }
    double endtime = timer.getCpuTime();

    const float timing = float(endtime - starttime);

    if (timing >= min_accurate_time) {
      time_per_iter = timing / iters_at_a_time;
      break;
    }

    iters_at_a_time *= 2;
  }

  delete[] lhs;
  delete[] rhs;
  delete[] dst;

  gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
}

void print_cpuinfo()
{
#ifdef __linux__
  cout << "contents of /proc/cpuinfo:" << endl;
  string line;
  ifstream cpuinfo("/proc/cpuinfo");
  if (cpuinfo.is_open()) {
    while (getline(cpuinfo, line)) {
      cout << line << endl;
    }
    cpuinfo.close();
  }
  cout << endl;
#elif defined __APPLE__
  cout << "output of sysctl hw:" << endl;
  system("sysctl hw");
  cout << endl;
#endif
}

template <typename T>
string type_name()
{
  return "unknown";
}

template<>
string type_name<float>()
{
  return "float";
}

template<>
string type_name<double>()
{
  return "double";
}

void show_usage_and_exit(const char *progname)
{
  cerr << "usage: " << progname << " [--min-working-set-size=N]" << endl << endl;
  cerr << "  --min-working-set-size=N:" << endl;
  cerr << "       Set the minimum working set size to N bytes." << endl;
  cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
  cerr << "       A larger working set lowers the chance of a warm cache." << endl;
  cerr << "       The default value 0 means use a large enough working" << endl;
  cerr << "       set to likely outsize caches." << endl;
  cerr << "       A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
  cerr << "       avoid warm caches." << endl;
  exit(1);
}

int main(int argc, char* argv[])
{
  for (int i = 1; i < argc; i++) {
    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
      const char* equals_sign = strchr(argv[i], '=');
      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
    } else {
      cerr << "unrecognized option: " << argv[i] << endl << endl;
      show_usage_and_exit(argv[0]);
    }
  }

  cout.precision(4);

  print_cpuinfo();

  cout << "benchmark parameters:" << endl;
  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
  cout << "scalar type: " << type_name<MatrixType::Scalar>() << endl;
  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
  cout << "minsize = " << minsize << endl;
  cout << "maxsize = " << maxsize << endl;
  cout << "measurement_repetitions = " << measurement_repetitions << endl;
  cout << "min_accurate_time = " << min_accurate_time << endl;
  cout << "min_working_set_size = " << min_working_set_size;
  if (min_working_set_size == 0) {
    cout << " (try to outsize caches)";
  }
  cout << endl << endl;


  // assemble the array of benchmarks without running them at first
  vector<benchmark_t> benchmarks;
  for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
    for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
      for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
        for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
          for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
            for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
              for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
                benchmark_t b(ksize, msize, nsize, kblock, mblock, nblock);
                benchmarks.push_back(b);
              }
            }
          }
        }
      }
    }
  }

  // randomly shuffling benchmarks allows us to get accurate enough progress info,
  // as now the cheap/expensive benchmarks are randomly mixed so they average out.
  random_shuffle(benchmarks.begin(), benchmarks.end());

  // timings here are only used to display progress info.
  // Whence the use of real time.
  double time_start = timer.getRealTime();
  double time_last_progress_update = time_start;
  for (size_t i = 0; i < benchmarks.size(); i++) {
    // Display progress info on stderr
    double time_now = timer.getRealTime();
    if (time_now > time_last_progress_update + 1.0f) {
      time_last_progress_update = time_now;
      float ratio_done = float(i) / benchmarks.size();
      cerr.precision(3);
      cerr << "Measurements... " << 100.0f * ratio_done
           << " %";

      if (i > 10) {
        cerr << ", ETA ";
        float eta = float(time_now - time_start) * (1.0f - ratio_done) / ratio_done;
        if (eta > 3600)
          cerr << eta/3600 << " hours";
        else if (eta > 60)
          cerr << eta/60 << " minutes";
        else cerr << eta << " seconds";
      }
      cerr << "                                              \r" << flush;
    }

    // This is where we actually run a benchmark!
    benchmarks[i].run();
  }

  // Erase progress info
  cerr << "                                                            " << endl;

  // Sort timings by increasing benchmark parameters, and decreasing gflops.
  // The latter is very important. It means that we can ignore all but the first
  // benchmark with given parameters.
  sort(benchmarks.begin(), benchmarks.end());

  // Collect best (i.e. now first) results for each parameter values.
  vector<benchmark_t> best_benchmarks;
  for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
    if (best_benchmarks.empty() ||
        best_benchmarks.back().compact_product_size != it->compact_product_size ||
        best_benchmarks.back().compact_block_size != it->compact_block_size)
    {
      best_benchmarks.push_back(*it);
    }
  }

  // Output data.
  cout << "BEGIN MEASUREMENTS" << endl;
  for (auto it = best_benchmarks.begin(); it != best_benchmarks.end(); ++it) {
    cout << *it << endl;
  }
}