aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/launch_dim.h
blob: 9b870ed6aac98db30a60e29fa8698fd5167a8a82 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// Types to express dimensionality of a kernel launch. Blocks and threads
// are (up to) 3-dimensional.
//
// A thread is conceptually like a SIMD lane. Some number of SIMD lanes,
// typically 32 (though that fact should not be relied on), are tied together
// with a single PC in a unit called a warp. There is a maximum number of
// threads that can execute in a shared-context entity called a block.
// Presently, that number is 1024 -- again, something that should not be relied
// on from this comment, but checked via perftools::gputools::DeviceDescription.
//
// For additional information, see
// http://docs.nvidia.com/cuda/kepler-tuning-guide/#device-utilization-and-occupancy
//
// Because of that modest thread-per-block limit, a kernel can be launched with
// multiple blocks. Each block is indivisibly scheduled onto a single core.
// Blocks can also be used in a multi-dimensional configuration, and the block
// count has much less modest limits -- typically they're similar to the maximum
// amount of addressable memory.

#ifndef TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
#define TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_

#include "tensorflow/stream_executor/platform/port.h"

#include "tensorflow/stream_executor/lib/strcat.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {

// Basic type that represents a 3-dimensional index space.
struct Dim3D {
  uint64 x, y, z;

  Dim3D(uint64 x, uint64 y, uint64 z) : x(x), y(y), z(z) {}
};

// Thread dimensionality for use in a kernel launch. See file comment for
// details.
struct ThreadDim : public Dim3D {
  explicit ThreadDim(uint64 x = 1, uint64 y = 1, uint64 z = 1)
      : Dim3D(x, y, z) {}

  // Returns a string representation of the thread dimensionality.
  string ToString() const {
    return port::StrCat("ThreadDim{", x, ", ", y, ", ", z, "}");
  }
};

// Block dimensionality for use in a kernel launch. See file comment for
// details.
struct BlockDim : public Dim3D {
  explicit BlockDim(uint64 x = 1, uint64 y = 1, uint64 z = 1)
      : Dim3D(x, y, z) {}

  // Returns a string representation of the block dimensionality.
  string ToString() const {
    return port::StrCat("BlockDim{", x, ", ", y, ", ", z, "}");
  }
};

}  // namespace gputools
}  // namespace perftools

#endif  // TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_