Diffstat (limited to 'tensorflow/stream_executor/device_description.h')
-rw-r--r--  tensorflow/stream_executor/device_description.h  370
1 file changed, 370 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
new file mode 100644
index 0000000000..e7b7102da5
--- /dev/null
+++ b/tensorflow/stream_executor/device_description.h
@@ -0,0 +1,370 @@
+// Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA
+// device and platform properties. Also contains convenience functions for
+// checking/calculating launch dimensionality based on device properties.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
+
+#include <map>
+#include <memory>
+
+#include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace internal {
+class DeviceDescriptionBuilder;
+} // namespace internal
+
+// Data that describes the execution target of the StreamExecutor, in terms of
+// important logical parameters. These include dimensionality limits and
+// physical parameters of interest, such as number of cores present on the
+// device.
+//
+// Thread-safe: immutable post-initialization.
+class DeviceDescription {
+ public:
+ // Returns the platform being run on; this value is primarily intended for
+ // printing, and comes out something like "OpenCL 1.2" or "Compute Capability
+ // 3.5".
+ const string &platform_version() const { return platform_version_; }
+
+  // Returns the version of the driver that interfaces with the underlying
+  // platform. Vendor-dependent format.
+ const string &driver_version() const { return driver_version_; }
+
+  // Returns the runtime version, if one is provided by the underlying
+  // platform. Vendor-dependent format and usefulness.
+ const string &runtime_version() const { return runtime_version_; }
+
+  // Returns the name that the device reports. Vendor-dependent.
+ const string &name() const { return name_; }
+
+ // Returns the PCI bus identifier for this device, of the form
+ // [domain]:[bus]:[device].[function]
+ const string &pci_bus_id() const { return pci_bus_id_; }
+
+ // Returns the NUMA node associated with this device, for use in
+ // determining socket locality. If the NUMA node could not be determined, -1
+ // is returned.
+ int numa_node() const { return numa_node_; }
+
+  // Returns the number of cores (in the traditional sense: a Streaming
+  // Multiprocessor on an NVIDIA device, or a Compute Unit on an AMD device).
+ int core_count() const { return core_count_; }
+
+ // Returns the limit on the thread dimensionality values in each of the
+ // respective dimensions. These limits affect what constitutes a legitimate
+ // kernel launch request.
+ const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; }
+
+ // Returns the limit on the block dimensionality values in each of the
+ // respective dimensions. These limits may affect what constitutes a
+ // legitimate kernel launch request.
+ const BlockDim &block_dim_limit() const { return block_dim_limit_; }
+
+ // Returns the limit on the number of simultaneously resident blocks
+ // on a multiprocessor.
+  const uint64 &blocks_per_core_limit() const {
+    return blocks_per_core_limit_;
+  }
+
+ // Returns the limit on the total number of threads that can be launched in a
+ // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
+ // This limit affects what constitutes a legitimate kernel launch request.
+ const uint64 &threads_per_block_limit() const {
+ return threads_per_block_limit_;
+ }
+
+ // Returns the limit on the total number of threads that can be simultaneously
+ // launched on a given multiprocessor.
+ const uint64 &threads_per_core_limit() const {
+ return threads_per_core_limit_;
+ }
+
+ // Returns the number of threads per warp/wavefront.
+ const uint64 &threads_per_warp() const { return threads_per_warp_; }
+
+ // Returns the limit on the total number of registers per core.
+ const uint64 &registers_per_core_limit() const {
+ return registers_per_core_limit_;
+ }
+
+ // Returns the limit on the total number of registers that can be
+ // simultaneously used by a block.
+ const uint64 &registers_per_block_limit() const {
+ return registers_per_block_limit_;
+ }
+
+ // Returns the limit on the total number of registers that can be
+ // allocated to a thread.
+ const uint64 &registers_per_thread_limit() const {
+ return registers_per_thread_limit_;
+ }
+
+ // Returns the granularity at which warps are allocated resources.
+ const uint64 &warp_alloc_granularity() const {
+ return warp_alloc_granularity_;
+ }
+
+ // Returns the granularity at which registers are allocated to warps.
+ const uint64 &register_alloc_granularity() const {
+ return register_alloc_granularity_;
+ }
+
+ // Returns the granularity at which shared memory is allocated to warps.
+ const uint64 &shared_memory_alloc_granularity() const {
+ return shared_memory_alloc_granularity_;
+ }
+
+ // Returns the number of address bits available to kernel code running on the
+ // platform. This affects things like the maximum allocation size and perhaps
+ // types used in kernel code such as size_t.
+ const uint64 &device_address_bits() const { return device_address_bits_; }
+
+ // Returns the device memory size in bytes.
+ uint64 device_memory_size() const { return device_memory_size_; }
+
+ // Returns the device's core clock rate in GHz.
+  float clock_rate_ghz() const { return clock_rate_ghz_; }
+
+ // Returns whether ECC is enabled.
+ bool ecc_enabled() const { return ecc_enabled_; }
+
+ // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced
+ // Micro Devices, Inc.", or "GenuineIntel".
+ const string &device_vendor() const { return device_vendor_; }
+
+ // Returns the CUDA compute capability if we're running on the CUDA platform.
+ // If a CUDA compute capability is not available, the major version will be
+ // zero, and the return value will be false.
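+  //
+  // A minimal usage sketch (the DeviceDescription instance `desc` is
+  // hypothetical):
+  //
+  //   int cc_major, cc_minor;
+  //   if (desc.cuda_compute_capability(&cc_major, &cc_minor)) {
+  //     // e.g. cc_major == 3, cc_minor == 5 on a "Compute Capability 3.5"
+  //     // device.
+  //   }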
+ bool cuda_compute_capability(int *major, int *minor) const;
+
+  // Returns the maximum amount of shared memory present on a single core
+  // (i.e. a Streaming Multiprocessor on NVIDIA GPUs, or a Compute Unit on
+  // OpenCL devices). Note that some devices, such as NVIDIA's, have a
+  // configurable partitioning between shared memory and L1 cache.
+ uint64 shared_memory_per_core() const { return shared_memory_per_core_; }
+
+ // Returns the maximum amount of shared memory available for a single block.
+ uint64 shared_memory_per_block() const { return shared_memory_per_block_; }
+
+ // TODO(leary): resident blocks per core will be useful.
+
+ // Convenience typedef for the string-based DeviceDescription mapping.
+ typedef std::map<string, string> Map;
+
+ // Returns a mapping from readable names to readable values that describe the
+ // device. This is useful for things like printing.
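+  //
+  // For illustration, the result can be dumped as key/value lines (sketch;
+  // `desc` is hypothetical):
+  //
+  //   for (const auto &entry : *desc.ToMap()) {
+  //     std::cout << entry.first << ": " << entry.second << std::endl;
+  //   }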
+ std::unique_ptr<Map> ToMap() const;
+
+  // For string values that are not available from the underlying platform,
+  // this sentinel value is returned.
+ static const char *kUndefinedString;
+
+ private:
+ friend class internal::DeviceDescriptionBuilder;
+
+ DeviceDescription();
+
+  // For descriptions of the following members, see the corresponding
+  // accessors above.
+ //
+ // N.B. If another field is added, update ToMap() above.
+ string device_vendor_;
+ string platform_version_;
+ string driver_version_;
+ string runtime_version_;
+ string pci_bus_id_;
+ string name_;
+
+ ThreadDim thread_dim_limit_;
+ BlockDim block_dim_limit_;
+
+ uint64 blocks_per_core_limit_;
+
+ uint64 threads_per_core_limit_;
+ uint64 threads_per_block_limit_;
+ uint64 threads_per_warp_;
+
+ uint64 registers_per_core_limit_;
+ uint64 registers_per_block_limit_;
+ uint64 registers_per_thread_limit_;
+
+ uint64 warp_alloc_granularity_;
+ uint64 register_alloc_granularity_;
+ uint64 shared_memory_alloc_granularity_;
+
+ uint64 device_address_bits_;
+ uint64 device_memory_size_;
+
+ // Shared memory limits on a given device.
+ uint64 shared_memory_per_core_;
+ uint64 shared_memory_per_block_;
+
+ float clock_rate_ghz_;
+
+ // CUDA "CC" major value, -1 if not available.
+ int cuda_compute_capability_major_;
+ int cuda_compute_capability_minor_;
+
+ int numa_node_;
+ int core_count_;
+ bool ecc_enabled_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
+};
+
+namespace internal {
+
+// Helper class that builds a DeviceDescription; it exists because the
+// description has a large number of fields that would be easily confused in
+// constructor form.
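+//
+// A minimal usage sketch; the field values below are illustrative, not real
+// device limits:
+//
+//   internal::DeviceDescriptionBuilder builder;
+//   builder.set_name("ExampleDevice");
+//   builder.set_threads_per_warp(32);
+//   builder.set_threads_per_block_limit(1024);
+//   std::unique_ptr<DeviceDescription> desc = builder.Build();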
+class DeviceDescriptionBuilder {
+ public:
+ DeviceDescriptionBuilder();
+
+ // For descriptions of the following fields, see comments on the corresponding
+ // DeviceDescription::* accessors above.
+
+ void set_device_vendor(const string &value) {
+ device_description_->device_vendor_ = value;
+ }
+ void set_platform_version(const string &value) {
+ device_description_->platform_version_ = value;
+ }
+ void set_driver_version(const string &value) {
+ device_description_->driver_version_ = value;
+ }
+ void set_runtime_version(const string &value) {
+ device_description_->runtime_version_ = value;
+ }
+ void set_pci_bus_id(const string &value) {
+ device_description_->pci_bus_id_ = value;
+ }
+ void set_name(const string &value) { device_description_->name_ = value; }
+
+ void set_thread_dim_limit(const ThreadDim &value) {
+ device_description_->thread_dim_limit_ = value;
+ }
+ void set_block_dim_limit(const BlockDim &value) {
+ device_description_->block_dim_limit_ = value;
+ }
+
+ void set_blocks_per_core_limit(uint64 value) {
+ device_description_->blocks_per_core_limit_ = value;
+ }
+
+ void set_threads_per_core_limit(uint64 value) {
+ device_description_->threads_per_core_limit_ = value;
+ }
+ void set_threads_per_block_limit(uint64 value) {
+ device_description_->threads_per_block_limit_ = value;
+ }
+ void set_threads_per_warp(uint64 value) {
+ device_description_->threads_per_warp_ = value;
+ }
+
+ void set_registers_per_core_limit(uint64 value) {
+ device_description_->registers_per_core_limit_ = value;
+ }
+ void set_registers_per_block_limit(uint64 value) {
+ device_description_->registers_per_block_limit_ = value;
+ }
+ void set_registers_per_thread_limit(uint64 value) {
+ device_description_->registers_per_thread_limit_ = value;
+ }
+
+ void set_warp_alloc_granularity(uint64 value) {
+ device_description_->warp_alloc_granularity_ = value;
+ }
+ void set_register_alloc_granularity(uint64 value) {
+ device_description_->register_alloc_granularity_ = value;
+ }
+ void set_shared_memory_alloc_granularity(uint64 value) {
+ device_description_->shared_memory_alloc_granularity_ = value;
+ }
+
+ void set_device_address_bits(uint64 value) {
+ device_description_->device_address_bits_ = value;
+ }
+ void set_device_memory_size(uint64 value) {
+ device_description_->device_memory_size_ = value;
+ }
+
+  void set_shared_memory_per_core(uint64 value) {
+    device_description_->shared_memory_per_core_ = value;
+  }
+  void set_shared_memory_per_block(uint64 value) {
+    device_description_->shared_memory_per_block_ = value;
+  }
+
+ void set_clock_rate_ghz(float value) {
+ device_description_->clock_rate_ghz_ = value;
+ }
+
+ void set_cuda_compute_capability(int major, int minor) {
+ device_description_->cuda_compute_capability_major_ = major;
+ device_description_->cuda_compute_capability_minor_ = minor;
+ }
+
+ void set_numa_node(int value) { device_description_->numa_node_ = value; }
+ void set_core_count(int value) { device_description_->core_count_ = value; }
+ void set_ecc_enabled(bool value) {
+ device_description_->ecc_enabled_ = value;
+ }
+
+ // Returns a built DeviceDescription with ownership transferred to the
+ // caller. There are currently no restrictions on which fields must be set in
+ // order to build the descriptor.
+ //
+ // Once the description is built, this builder object should be discarded.
+ std::unique_ptr<DeviceDescription> Build() {
+ return std::move(device_description_);
+ }
+
+ private:
+ std::unique_ptr<DeviceDescription> device_description_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
+};
+
+} // namespace internal
+
+// Returns whether the given thread_dim is acceptable given the limits
+// described in device_description. For the detailed reason a thread_dim fails
+// this predicate, enable VLOG(2) for this module.
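+//
+// Illustrative guard before a kernel launch (the block shape is
+// hypothetical):
+//
+//   if (!ThreadDimOk(device_description, ThreadDim(1024, 1, 1))) {
+//     // Fall back to a smaller block shape.
+//   }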
+bool ThreadDimOk(const DeviceDescription &device_description,
+ const ThreadDim &thread_dim);
+
+// [deprecated] Use MathUtil::CeilOfRatio directly instead.
+//
+// Equivalent to ceil(static_cast<double>(x) / y), i.e. x / y rounded up to
+// the nearest integer.
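+//
+// For example: DivideCeil(10, 3) == 4 and DivideCeil(9, 3) == 3.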
+uint64 DivideCeil(uint64 x, uint64 y);
+
+// Calculates the number of threads/blocks required to process element_count
+// elements. Note that you can still end up with more threads than
+// element_count due to rounding, so kernels often start with an "is this
+// thread id in the element_count range?" test.
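+//
+// Typical call-site shape (sketch; element_count is a hypothetical input):
+//
+//   uint64 threads_per_block, block_count;
+//   CalculateDimensionality(device_description, element_count,
+//                           &threads_per_block, &block_count);
+//   // In the kernel, guard with: if (thread_idx >= element_count) return;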
+void CalculateDimensionality(const DeviceDescription &device_description,
+ uint64 element_count, uint64 *threads_per_block,
+ uint64 *block_count);
+
+// Computes and returns the maximum number of blocks per core (occupancy),
+// based on the device description, some kernel characteristics, and the
+// number of threads per block. If occupancy cannot be computed, zero is
+// returned.
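+//
+// Sketch; the kernel resource figures below are hypothetical:
+//
+//   uint64 blocks_per_core = CalculateOccupancy(
+//       device_description, /*registers_per_thread=*/32,
+//       /*shared_memory_per_block=*/4096, ThreadDim(256, 1, 1));
+//   // A result of 0 means occupancy could not be computed.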
+uint64 CalculateOccupancy(const DeviceDescription &device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim &thread_dims);
+
+// Computes and returns the maximum number of registers per thread that
+// achieves the target occupancy. If the target is not achievable, zero is
+// returned.
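+//
+// Sketch; the target of 4 resident blocks per core is hypothetical:
+//
+//   uint64 register_limit = CalculateRegisterLimitForTargetOccupancy(
+//       device_description, /*shared_memory_per_block=*/4096,
+//       ThreadDim(256, 1, 1), /*target_blocks_per_core=*/4);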
+uint64 CalculateRegisterLimitForTargetOccupancy(
+ const DeviceDescription &device_description, uint64 shared_memory_per_block,
+ const ThreadDim &thread_dims, uint64 target_blocks_per_core);
+
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_