aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/device_description.h
diff options
context:
space:
mode:
authorGravatar Matt Conley <mconley@nvidia.com>2018-08-28 18:55:51 -0700
committerGravatar Matt Conley <mconley@nvidia.com>2018-08-28 18:55:51 -0700
commite93a9f9ccfd9c7a2419bf3fc1d7866765bbcfce3 (patch)
tree64911ea09beae2cd57365da73ca03c3d805665db /tensorflow/stream_executor/device_description.h
parent2e7352e57c541908cd700bb0fe53a04b456392c9 (diff)
Update GPU occupancy checking to utilize CUDA's occupancy calculator functions
- Replace references to the UnqueryableDeviceParams struct with calls to CUDA's built-in occupancy calculation functions
- Update calls to the occupancy checking functions with the new changes
- Changes should provide more long-term reliability and will remove the need to manually update hardcoded data values for new GPU architectures
Diffstat (limited to 'tensorflow/stream_executor/device_description.h')
-rw-r--r--tensorflow/stream_executor/device_description.h73
1 files changed, 13 insertions, 60 deletions
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index 7f99d81ef3..d335b9b875 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -24,6 +24,7 @@ limitations under the License.
#include <memory>
#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/platform/port.h"
@@ -79,10 +80,6 @@ class DeviceDescription {
// legitimate kernel launch request.
const BlockDim &block_dim_limit() const { return block_dim_limit_; }
- // Returns the limit on the number of simultaneously resident blocks
- // on a multiprocessor.
- uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
-
// Returns the limit on the total number of threads that can be launched in a
// single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
// This limit affects what constitutes a legitimate kernel launch request.
@@ -110,27 +107,6 @@ class DeviceDescription {
return registers_per_block_limit_;
}
- // Returns the limit on the total number of registers that can be
- // allocated to a thread.
- const uint64 &registers_per_thread_limit() const {
- return registers_per_thread_limit_;
- }
-
- // Returns the granularity at which warps are allocated resources.
- const uint64 &warp_alloc_granularity() const {
- return warp_alloc_granularity_;
- }
-
- // Returns the granularity at which registers are allocated to warps.
- const uint64 &register_alloc_granularity() const {
- return register_alloc_granularity_;
- }
-
- // Returns the granularity at which shared memory is allocated to warps.
- const uint64 &shared_memory_alloc_granularity() const {
- return shared_memory_alloc_granularity_;
- }
-
// Returns the number of address bits available to kernel code running on the
// platform. This affects things like the maximum allocation size and perhaps
// types used in kernel code such as size_t.
@@ -200,19 +176,12 @@ class DeviceDescription {
ThreadDim thread_dim_limit_;
BlockDim block_dim_limit_;
- uint64 blocks_per_core_limit_;
-
uint64 threads_per_core_limit_;
uint64 threads_per_block_limit_;
uint64 threads_per_warp_;
uint64 registers_per_core_limit_;
uint64 registers_per_block_limit_;
- uint64 registers_per_thread_limit_;
-
- uint64 warp_alloc_granularity_;
- uint64 register_alloc_granularity_;
- uint64 shared_memory_alloc_granularity_;
uint64 device_address_bits_;
uint64 device_memory_size_;
@@ -270,10 +239,6 @@ class DeviceDescriptionBuilder {
device_description_->block_dim_limit_ = value;
}
- void set_blocks_per_core_limit(uint64 value) {
- device_description_->blocks_per_core_limit_ = value;
- }
-
void set_threads_per_core_limit(uint64 value) {
device_description_->threads_per_core_limit_ = value;
}
@@ -290,19 +255,6 @@ class DeviceDescriptionBuilder {
void set_registers_per_block_limit(uint64 value) {
device_description_->registers_per_block_limit_ = value;
}
- void set_registers_per_thread_limit(uint64 value) {
- device_description_->registers_per_thread_limit_ = value;
- }
-
- void set_warp_alloc_granularity(uint64 value) {
- device_description_->warp_alloc_granularity_ = value;
- }
- void set_register_alloc_granularity(uint64 value) {
- device_description_->register_alloc_granularity_ = value;
- }
- void set_shared_memory_alloc_granularity(uint64 value) {
- device_description_->shared_memory_alloc_granularity_ = value;
- }
void set_device_address_bits(uint64 value) {
device_description_->device_address_bits_ = value;
@@ -375,17 +327,18 @@ void CalculateDimensionality(const DeviceDescription &device_description,
// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block. If unable to compute occupancy, zero is returned.
-uint64 CalculateOccupancy(const DeviceDescription &device_description,
- uint64 registers_per_thread,
- uint64 shared_memory_per_block,
- const ThreadDim &thread_dims);
-
-// Compute and return the maximum number of registers per thread which
-// achieves the target occupancy. If the target is not possible then
-// zero is returned.
-uint64 CalculateRegisterLimitForTargetOccupancy(
- const DeviceDescription &device_description, uint64 shared_memory_per_block,
- const ThreadDim &thread_dims, uint64 target_blocks_per_core);
+int CalculateOccupancy(const DeviceDescription& device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim& thread_dims, CUfunction func);
+
+// Compute and return the suggested thread count to achieve ideal occupancy.
+// If the provided thread dimensions match this number, zero is returned.
+int CompareOccupancy(int* initial_blocks,
+ const DeviceDescription& device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim& thread_dims, CUfunction func);
} // namespace stream_executor