// tensorflow/stream_executor/device_description.h

// Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA
// device and platform properties. Also contains convenience functions for
// checking/calculating launch dimensionality based on device properties.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_

#include <map>
#include <memory>
#include "tensorflow/stream_executor/platform/port.h"

#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {
// Forward declaration: the builder class is defined further down in this
// header, but DeviceDescription has to name it in a friend declaration first.
namespace internal {
class DeviceDescriptionBuilder;
}  // namespace internal

// Data that describes the execution target of the StreamExecutor, in terms of
// important logical parameters. These include dimensionality limits and
// physical parameters of interest, such as number of cores present on the
// device.
//
// Instances cannot be constructed directly: the constructor is private and
// internal::DeviceDescriptionBuilder is the only friend, so every description
// is produced through the builder. Copying is disabled via
// SE_DISALLOW_COPY_AND_ASSIGN.
//
// Thread-safe: immutable post-initialization.
class DeviceDescription {
 public:
  // Returns the platform being run on; this value is primarily intended for
  // printing, and comes out something like "OpenCL 1.2" or "Compute Capability
  // 3.5".
  const string &platform_version() const { return platform_version_; }

  // Returns the driver version interfacing with the underlying platform. Vendor
  // dependent format.
  const string &driver_version() const { return driver_version_; }

  // Return the runtime version, if one is provided by the underlying platform.
  // Vendor dependent format / usefulness.
  const string &runtime_version() const { return runtime_version_; }

  // Returns the name that the device reports. Vendor dependent.
  const string &name() const { return name_; }

  // Returns the PCI bus identifier for this device, of the form
  // [domain]:[bus]:[device].[function]
  const string &pci_bus_id() const { return pci_bus_id_; }

  // Returns the NUMA node associated with this device, for use in
  // determining socket locality. If the NUMA node could not be determined, -1
  // is returned.
  int numa_node() const { return numa_node_; }

  // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device
  // or an AMD Compute Unit).
  int core_count() const { return core_count_; }

  // Returns the limit on the thread dimensionality values in each of the
  // respective dimensions. These limits affect what constitutes a legitimate
  // kernel launch request.
  const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; }

  // Returns the limit on the block dimensionality values in each of the
  // respective dimensions. These limits may affect what constitutes a
  // legitimate kernel launch request.
  const BlockDim &block_dim_limit() const { return block_dim_limit_; }

  // Returns the limit on the number of simultaneously resident blocks
  // on a multiprocessor.
  //
  // Returned by value; a top-level const on a by-value scalar return carries
  // no meaning for callers, so none is applied.
  uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }

  // Returns the limit on the total number of threads that can be launched in a
  // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
  // This limit affects what constitutes a legitimate kernel launch request.
  const uint64 &threads_per_block_limit() const {
    return threads_per_block_limit_;
  }

  // Returns the limit on the total number of threads that can be simultaneously
  // launched on a given multiprocessor.
  const uint64 &threads_per_core_limit() const {
    return threads_per_core_limit_;
  }

  // Returns the number of threads per warp/wavefront.
  const uint64 &threads_per_warp() const { return threads_per_warp_; }

  // Returns the limit on the total number of registers per core.
  const uint64 &registers_per_core_limit() const {
    return registers_per_core_limit_;
  }

  // Returns the limit on the total number of registers that can be
  // simultaneously used by a block.
  const uint64 &registers_per_block_limit() const {
    return registers_per_block_limit_;
  }

  // Returns the limit on the total number of registers that can be
  // allocated to a thread.
  const uint64 &registers_per_thread_limit() const {
    return registers_per_thread_limit_;
  }

  // Returns the granularity at which warps are allocated resources.
  const uint64 &warp_alloc_granularity() const {
    return warp_alloc_granularity_;
  }

  // Returns the granularity at which registers are allocated to warps.
  const uint64 &register_alloc_granularity() const {
    return register_alloc_granularity_;
  }

  // Returns the granularity at which shared memory is allocated to warps.
  const uint64 &shared_memory_alloc_granularity() const {
    return shared_memory_alloc_granularity_;
  }

  // Returns the number of address bits available to kernel code running on the
  // platform. This affects things like the maximum allocation size and perhaps
  // types used in kernel code such as size_t.
  const uint64 &device_address_bits() const { return device_address_bits_; }

  // Returns the device memory size in bytes.
  uint64 device_memory_size() const { return device_memory_size_; }

  // Returns the device's core clock rate in GHz.
  //
  // Returned by value without a top-level const, for the same reason as
  // blocks_per_core_limit().
  float clock_rate_ghz() const { return clock_rate_ghz_; }

  // Returns whether ECC is enabled.
  bool ecc_enabled() const { return ecc_enabled_; }

  // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced
  // Micro Devices, Inc.", or "GenuineIntel".
  const string &device_vendor() const { return device_vendor_; }

  // Returns the CUDA compute capability if we're running on the CUDA platform.
  // If a CUDA compute capability is not available, the major version will be
  // zero, and the return value will be false.
  //
  // The result is written through the major and minor out-pointers; the
  // definition lives outside this header.
  bool cuda_compute_capability(int *major, int *minor) const;

  // Returns the maximum amount of shared memory present on a single core
  // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
  // devices). Note that some devices, such as NVIDIA's have a configurable
  // partitioning between shared memory and L1 cache.
  uint64 shared_memory_per_core() const { return shared_memory_per_core_; }

  // Returns the maximum amount of shared memory available for a single block.
  uint64 shared_memory_per_block() const { return shared_memory_per_block_; }

  // TODO(leary): resident blocks per core will be useful.
  // NOTE(review): blocks_per_core_limit() above appears to cover this already;
  // confirm and drop the TODO if so.

  // Convenience typedef for the string-based DeviceDescription mapping.
  typedef std::map<string, string> Map;

  // Returns a mapping from readable names to readable values that describe the
  // device. This is useful for things like printing.
  std::unique_ptr<Map> ToMap() const;

  // For string values that are not available via the underlying platform, this
  // value will be provided.
  static const char *kUndefinedString;

 private:
  // The builder writes the private fields below directly.
  friend class internal::DeviceDescriptionBuilder;

  // Private: only the friend builder may create instances.
  DeviceDescription();

  // For description of the following members, see the corresponding accessor
  // above.
  //
  // N.B. If another field is added, update ToMap() above.
  string device_vendor_;
  string platform_version_;
  string driver_version_;
  string runtime_version_;
  string pci_bus_id_;
  string name_;

  ThreadDim thread_dim_limit_;
  BlockDim block_dim_limit_;

  uint64 blocks_per_core_limit_;

  uint64 threads_per_core_limit_;
  uint64 threads_per_block_limit_;
  uint64 threads_per_warp_;

  uint64 registers_per_core_limit_;
  uint64 registers_per_block_limit_;
  uint64 registers_per_thread_limit_;

  uint64 warp_alloc_granularity_;
  uint64 register_alloc_granularity_;
  uint64 shared_memory_alloc_granularity_;

  uint64 device_address_bits_;
  uint64 device_memory_size_;

  // Shared memory limits on a given device.
  uint64 shared_memory_per_core_;
  uint64 shared_memory_per_block_;

  float clock_rate_ghz_;

  // CUDA "CC" major value, -1 if not available.
  // NOTE(review): the cuda_compute_capability() comment above says the major
  // version is zero when unavailable; confirm which sentinel the constructor
  // and accessor definitions actually use.
  int cuda_compute_capability_major_;
  int cuda_compute_capability_minor_;

  int numa_node_;
  int core_count_;
  bool ecc_enabled_;

  SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
};

namespace internal {

// Helper class that builds a device description, given that it has a large
// number of fields that would be easily confused in constructor form.
//
// Each set_* method writes straight into the corresponding private field of
// the DeviceDescription held by this builder (the builder is a friend of
// DeviceDescription). Build() then hands that object to the caller.
class DeviceDescriptionBuilder {
 public:
  // Defined outside this header.
  // NOTE(review): presumably allocates the DeviceDescription that the setters
  // below dereference -- confirm against the definition.
  DeviceDescriptionBuilder();

  // For descriptions of the following fields, see comments on the corresponding
  // DeviceDescription::* accessors above.

  void set_device_vendor(const string &value) {
    device_description_->device_vendor_ = value;
  }
  void set_platform_version(const string &value) {
    device_description_->platform_version_ = value;
  }
  void set_driver_version(const string &value) {
    device_description_->driver_version_ = value;
  }
  void set_runtime_version(const string &value) {
    device_description_->runtime_version_ = value;
  }
  void set_pci_bus_id(const string &value) {
    device_description_->pci_bus_id_ = value;
  }
  void set_name(const string &value) { device_description_->name_ = value; }

  void set_thread_dim_limit(const ThreadDim &value) {
    device_description_->thread_dim_limit_ = value;
  }
  void set_block_dim_limit(const BlockDim &value) {
    device_description_->block_dim_limit_ = value;
  }

  void set_blocks_per_core_limit(uint64 value) {
    device_description_->blocks_per_core_limit_ = value;
  }

  void set_threads_per_core_limit(uint64 value) {
    device_description_->threads_per_core_limit_ = value;
  }
  void set_threads_per_block_limit(uint64 value) {
    device_description_->threads_per_block_limit_ = value;
  }
  void set_threads_per_warp(uint64 value) {
    device_description_->threads_per_warp_ = value;
  }

  void set_registers_per_core_limit(uint64 value) {
    device_description_->registers_per_core_limit_ = value;
  }
  void set_registers_per_block_limit(uint64 value) {
    device_description_->registers_per_block_limit_ = value;
  }
  void set_registers_per_thread_limit(uint64 value) {
    device_description_->registers_per_thread_limit_ = value;
  }

  void set_warp_alloc_granularity(uint64 value) {
    device_description_->warp_alloc_granularity_ = value;
  }
  void set_register_alloc_granularity(uint64 value) {
    device_description_->register_alloc_granularity_ = value;
  }
  void set_shared_memory_alloc_granularity(uint64 value) {
    device_description_->shared_memory_alloc_granularity_ = value;
  }

  void set_device_address_bits(uint64 value) {
    device_description_->device_address_bits_ = value;
  }
  void set_device_memory_size(uint64 value) {
    device_description_->device_memory_size_ = value;
  }

  // Unlike the other numeric setters, the two shared-memory setters accept a
  // signed int64 while the destination fields are uint64. The signature is
  // kept for existing callers; the signed-to-unsigned conversion is spelled
  // out so it is visible. A negative argument wraps to a very large unsigned
  // value, exactly as the implicit conversion would.
  void set_shared_memory_per_core(int64 value) {
    device_description_->shared_memory_per_core_ = static_cast<uint64>(value);
  }
  void set_shared_memory_per_block(int64 value) {
    device_description_->shared_memory_per_block_ = static_cast<uint64>(value);
  }

  void set_clock_rate_ghz(float value) {
    device_description_->clock_rate_ghz_ = value;
  }

  void set_cuda_compute_capability(int major, int minor) {
    device_description_->cuda_compute_capability_major_ = major;
    device_description_->cuda_compute_capability_minor_ = minor;
  }

  void set_numa_node(int value) { device_description_->numa_node_ = value; }
  void set_core_count(int value) { device_description_->core_count_ = value; }
  void set_ecc_enabled(bool value) {
    device_description_->ecc_enabled_ = value;
  }

  // Returns a built DeviceDescription with ownership transferred to the
  // caller. There are currently no restrictions on which fields must be set in
  // order to build the descriptor.
  //
  // Once the description is built, this builder object should be discarded:
  // moving out of device_description_ leaves it null, so any later set_* call
  // would dereference a null pointer and a second Build() would return null.
  std::unique_ptr<DeviceDescription> Build() {
    // std::move is required here: device_description_ is a member, not a
    // local, so it is not implicitly moved on return.
    return std::move(device_description_);
  }

 private:
  // The description under construction; null after Build().
  std::unique_ptr<DeviceDescription> device_description_;

  SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
};

}  // namespace internal

// Checks a requested thread dimensionality against a device's limits.
//
// Returns whether the given thread_dim is acceptable given the limits described
// in device_description. For detailed reasons for failing the predicate, enable
// VLOG(2) for this module.
//
// NOTE(review): which limits are consulted (presumably thread_dim_limit() and
// threads_per_block_limit()) is decided by the definition, which is not part
// of this header -- confirm there.
bool ThreadDimOk(const DeviceDescription &device_description,
                 const ThreadDim &thread_dim);

// [deprecated] Use MathUtil::CeilOfRatio directly instead.
//
// Returns x divided by y, rounded up; i.e. equivalent to
// ceil(double(x) / y). In the launch-dimension use case x is the element count
// and y is the number of threads per block.
//
// NOTE(review): behavior for y == 0 is not specified in this header -- check
// the definition before relying on it.
uint64 DivideCeil(uint64 x, uint64 y);

// Calculate the number of threads/blocks required to process element_count
// elements. Note that you can still end up with more threads than
// element_count due to rounding, so kernels often start with an "is this
// thread id in the element_count range?" test.
//
// The results are delivered through the threads_per_block and block_count
// pointers rather than a return value.
// NOTE(review): presumably both pointers must be non-null, and the device's
// threads-per-block limit bounds *threads_per_block -- confirm against the
// definition, which is not part of this header.
void CalculateDimensionality(const DeviceDescription &device_description,
                             uint64 element_count, uint64 *threads_per_block,
                             uint64 *block_count);

// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block.  If unable to compute occupancy, zero is returned.
//
// The kernel characteristics are registers_per_thread and
// shared_memory_per_block; the thread count per block is taken from
// thread_dims.
// NOTE(review): shared_memory_per_block is presumably in bytes, matching
// DeviceDescription::shared_memory_per_block() -- confirm against the
// definition.
uint64 CalculateOccupancy(const DeviceDescription &device_description,
                          uint64 registers_per_thread,
                          uint64 shared_memory_per_block,
                          const ThreadDim &thread_dims);

// Compute and return the maximum number of registers per thread which
// achieves the target occupancy.  If the target is not possible then
// zero is returned.
//
// The target occupancy is given by target_blocks_per_core, i.e. in blocks per
// core; shared_memory_per_block and thread_dims describe the kernel launch
// being considered.
// NOTE(review): presumably the inverse of CalculateOccupancy with respect to
// registers_per_thread -- confirm against the definition.
uint64 CalculateRegisterLimitForTargetOccupancy(
    const DeviceDescription &device_description, uint64 shared_memory_per_block,
    const ThreadDim &thread_dims, uint64 target_blocks_per_core);

}  // namespace gputools
}  // namespace perftools

#endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_