diff options
Diffstat (limited to 'tensorflow/stream_executor/device_description.h')
-rw-r--r-- | tensorflow/stream_executor/device_description.h | 370 |
1 files changed, 370 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h new file mode 100644 index 0000000000..e7b7102da5 --- /dev/null +++ b/tensorflow/stream_executor/device_description.h @@ -0,0 +1,370 @@ +// Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA +// device and platform properties. Also contains convenience functions for +// checking/calculating launch dimensionality based on device properties. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ +#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ + +#include <map> +#include <memory> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/launch_dim.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { +namespace internal { +class DeviceDescriptionBuilder; +} // namespace internal + +// Data that describes the execution target of the StreamExecutor, in terms of +// important logical parameters. These include dimensionality limits and +// physical parameters of interest, such as number of cores present on the +// device. +// +// Thread-safe: immutable post-initialization. +class DeviceDescription { + public: + // Returns the platform being run on; this value is primarily intended for + // printing, and comes out something like "OpenCL 1.2" or "Compute Capability + // 3.5". + const string &platform_version() const { return platform_version_; } + + // Returns the driver version interfacing with the underlying platform. Vendor + // dependent format. + const string &driver_version() const { return driver_version_; } + + // Return the runtime version, if one is provided by the underlying platform. + // Vendor dependent format / usefulness. + const string &runtime_version() const { return runtime_version_; } + + // Returns the name that the device reports. Vendor dependent. + const string &name() const { return name_; } + + // Returns the PCI bus identifier for this device, of the form + // [domain]:[bus]:[device].[function] + const string &pci_bus_id() const { return pci_bus_id_; } + + // Returns the NUMA node associated with this device, for use in + // determining socket locality. If the NUMA node could not be determined, -1 + // is returned. + int numa_node() const { return numa_node_; } + + // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device + // or an AMD Compute Unit. + int core_count() const { return core_count_; } + + // Returns the limit on the thread dimensionality values in each of the + // respective dimensions. These limits affect what constitutes a legitimate + // kernel launch request. + const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; } + + // Returns the limit on the block dimensionality values in each of the + // respective dimensions. These limits may affect what constitutes a + // legitimate kernel launch request. + const BlockDim &block_dim_limit() const { return block_dim_limit_; } + + // Returns the limit on the number of simultaneously resident blocks + // on a multiprocessor. + const uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; } + + // Returns the limit on the total number of threads that can be launched in a + // single block; i.e. the limit on x * y * z dimensions of a ThreadDim. + // This limit affects what constitutes a legitimate kernel launch request. + const uint64 &threads_per_block_limit() const { + return threads_per_block_limit_; + } + + // Returns the limit on the total number of threads that can be simultaneously + // launched on a given multiprocessor. + const uint64 &threads_per_core_limit() const { + return threads_per_core_limit_; + } + + // Returns the number of threads per warp/wavefront. + const uint64 &threads_per_warp() const { return threads_per_warp_; } + + // Returns the limit on the total number of registers per core. + const uint64 ®isters_per_core_limit() const { + return registers_per_core_limit_; + } + + // Returns the limit on the total number of registers that can be + // simultaneously used by a block. + const uint64 ®isters_per_block_limit() const { + return registers_per_block_limit_; + } + + // Returns the limit on the total number of registers that can be + // allocated to a thread. + const uint64 ®isters_per_thread_limit() const { + return registers_per_thread_limit_; + } + + // Returns the granularity at which warps are allocated resources. + const uint64 &warp_alloc_granularity() const { + return warp_alloc_granularity_; + } + + // Returns the granularity at which registers are allocated to warps. + const uint64 ®ister_alloc_granularity() const { + return register_alloc_granularity_; + } + + // Returns the granularity at which shared memory is allocated to warps. + const uint64 &shared_memory_alloc_granularity() const { + return shared_memory_alloc_granularity_; + } + + // Returns the number of address bits available to kernel code running on the + // platform. This affects things like the maximum allocation size and perhaps + // types used in kernel code such as size_t. + const uint64 &device_address_bits() const { return device_address_bits_; } + + // Returns the device memory size in bytes. + uint64 device_memory_size() const { return device_memory_size_; } + + // Returns the device's core clock rate in GHz. + const float clock_rate_ghz() const { return clock_rate_ghz_; } + + // Returns whether ECC is enabled. + bool ecc_enabled() const { return ecc_enabled_; } + + // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced + // Micro Devices, Inc.", or "GenuineIntel". + const string &device_vendor() const { return device_vendor_; } + + // Returns the CUDA compute capability if we're running on the CUDA platform. + // If a CUDA compute capability is not available, the major version will be + // zero, and the return value will be false. + bool cuda_compute_capability(int *major, int *minor) const; + + // Returns the maximum amount of shared memory present on a single core + // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL + // devices). Note that some devices, such as NVIDIA's have a configurable + // partitioning between shared memory and L1 cache. + uint64 shared_memory_per_core() const { return shared_memory_per_core_; } + + // Returns the maximum amount of shared memory available for a single block. + uint64 shared_memory_per_block() const { return shared_memory_per_block_; } + + // TODO(leary): resident blocks per core will be useful. + + // Convenience typedef for the string-based DeviceDescription mapping. + typedef std::map<string, string> Map; + + // Returns a mapping from readable names to readable values that describe the + // device. This is useful for things like printing. + std::unique_ptr<Map> ToMap() const; + + // For string values that are not available via the underlying platform, this + // value will be provided. + static const char *kUndefinedString; + + private: + friend class internal::DeviceDescriptionBuilder; + + DeviceDescription(); + + // For description of the following members, see the corresponding accessor + // above. + // + // N.B. If another field is added, update ToMap() above. + string device_vendor_; + string platform_version_; + string driver_version_; + string runtime_version_; + string pci_bus_id_; + string name_; + + ThreadDim thread_dim_limit_; + BlockDim block_dim_limit_; + + uint64 blocks_per_core_limit_; + + uint64 threads_per_core_limit_; + uint64 threads_per_block_limit_; + uint64 threads_per_warp_; + + uint64 registers_per_core_limit_; + uint64 registers_per_block_limit_; + uint64 registers_per_thread_limit_; + + uint64 warp_alloc_granularity_; + uint64 register_alloc_granularity_; + uint64 shared_memory_alloc_granularity_; + + uint64 device_address_bits_; + uint64 device_memory_size_; + + // Shared memory limits on a given device. + uint64 shared_memory_per_core_; + uint64 shared_memory_per_block_; + + float clock_rate_ghz_; + + // CUDA "CC" major value, -1 if not available. + int cuda_compute_capability_major_; + int cuda_compute_capability_minor_; + + int numa_node_; + int core_count_; + bool ecc_enabled_; + + SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription); +}; + +namespace internal { + +// Helper class the builds a device description, given that it has a large +// number of fields that would be easily confused in constructor form. +class DeviceDescriptionBuilder { + public: + DeviceDescriptionBuilder(); + + // For descriptions of the following fields, see comments on the corresponding + // DeviceDescription::* accessors above. + + void set_device_vendor(const string &value) { + device_description_->device_vendor_ = value; + } + void set_platform_version(const string &value) { + device_description_->platform_version_ = value; + } + void set_driver_version(const string &value) { + device_description_->driver_version_ = value; + } + void set_runtime_version(const string &value) { + device_description_->runtime_version_ = value; + } + void set_pci_bus_id(const string &value) { + device_description_->pci_bus_id_ = value; + } + void set_name(const string &value) { device_description_->name_ = value; } + + void set_thread_dim_limit(const ThreadDim &value) { + device_description_->thread_dim_limit_ = value; + } + void set_block_dim_limit(const BlockDim &value) { + device_description_->block_dim_limit_ = value; + } + + void set_blocks_per_core_limit(uint64 value) { + device_description_->blocks_per_core_limit_ = value; + } + + void set_threads_per_core_limit(uint64 value) { + device_description_->threads_per_core_limit_ = value; + } + void set_threads_per_block_limit(uint64 value) { + device_description_->threads_per_block_limit_ = value; + } + void set_threads_per_warp(uint64 value) { + device_description_->threads_per_warp_ = value; + } + + void set_registers_per_core_limit(uint64 value) { + device_description_->registers_per_core_limit_ = value; + } + void set_registers_per_block_limit(uint64 value) { + device_description_->registers_per_block_limit_ = value; + } + void set_registers_per_thread_limit(uint64 value) { + device_description_->registers_per_thread_limit_ = value; + } + + void set_warp_alloc_granularity(uint64 value) { + device_description_->warp_alloc_granularity_ = value; + } + void set_register_alloc_granularity(uint64 value) { + device_description_->register_alloc_granularity_ = value; + } + void set_shared_memory_alloc_granularity(uint64 value) { + device_description_->shared_memory_alloc_granularity_ = value; + } + + void set_device_address_bits(uint64 value) { + device_description_->device_address_bits_ = value; + } + void set_device_memory_size(uint64 value) { + device_description_->device_memory_size_ = value; + } + + void set_shared_memory_per_core(int64 value) { + device_description_->shared_memory_per_core_ = value; + } + void set_shared_memory_per_block(int64 value) { + device_description_->shared_memory_per_block_ = value; + } + + void set_clock_rate_ghz(float value) { + device_description_->clock_rate_ghz_ = value; + } + + void set_cuda_compute_capability(int major, int minor) { + device_description_->cuda_compute_capability_major_ = major; + device_description_->cuda_compute_capability_minor_ = minor; + } + + void set_numa_node(int value) { device_description_->numa_node_ = value; } + void set_core_count(int value) { device_description_->core_count_ = value; } + void set_ecc_enabled(bool value) { + device_description_->ecc_enabled_ = value; + } + + // Returns a built DeviceDescription with ownership transferred to the + // caller. There are currently no restrictions on which fields must be set in + // order to build the descriptor. + // + // Once the description is built, this builder object should be discarded. + std::unique_ptr<DeviceDescription> Build() { + return std::move(device_description_); + } + + private: + std::unique_ptr<DeviceDescription> device_description_; + + SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder); +}; + +} // namespace internal + +// Returns whether the given thread_dim is acceptable given the limits described +// in device_description. For detailed reasons for failing the predicate, enable +// VLOG(2) for this module. +bool ThreadDimOk(const DeviceDescription &device_description, + const ThreadDim &thread_dim); + +// [deprecated] Use MathUtil::CeilOfRatio directly instead. +// +// Equivalent to ceil(double(element_count) / threads_per_block). +uint64 DivideCeil(uint64 x, uint64 y); + +// Calculate the number of threads/blocks required to process element_count +// elements. Note that you can still end up with more threads than +// element_count due to rounding, so kernels often start with an "is this +// thread id in the element_count range?" test. +void CalculateDimensionality(const DeviceDescription &device_description, + uint64 element_count, uint64 *threads_per_block, + uint64 *block_count); + +// Compute and return maximum blocks per core (occupancy) based on the +// device description, some kernel characteristics and the number of threads per +// block. If unable to compute occupancy, zero is returned. +uint64 CalculateOccupancy(const DeviceDescription &device_description, + uint64 registers_per_thread, + uint64 shared_memory_per_block, + const ThreadDim &thread_dims); + +// Compute and return the maximum number of registers per thread which +// achieves the target occupancy. If the target is not possible then +// zero is returned. +uint64 CalculateRegisterLimitForTargetOccupancy( + const DeviceDescription &device_description, uint64 shared_memory_per_block, + const ThreadDim &thread_dims, uint64 target_blocks_per_core); + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ |