author    TensorFlower Gardener <gardener@tensorflow.org>  2018-10-01 21:18:17 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>  2018-10-01 21:18:17 -0700
commit    6161d8cc4d66b87ba198cb6a16d83ce317c77eff (patch)
tree      47b45fcd51a5bf0001afd30295fc6e236d563a4e /tensorflow/stream_executor
parent    350388fca9cb9509962ff393a9d21fb2879c9179 (diff)
parent    6a5090b086bc9d665eb9e65f05eb94cdb58baaa2 (diff)
Merge pull request #21958 from MattConley:CudaOccupancy
PiperOrigin-RevId: 215331087
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | 222
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_gpu_executor.h  |  11
-rw-r--r--  tensorflow/stream_executor/device_description.cc     |  76
-rw-r--r--  tensorflow/stream_executor/device_description.h      |  64
4 files changed, 63 insertions(+), 310 deletions(-)
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index e30f50ea2a..5cceb8983c 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -470,30 +470,59 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
const DeviceDescription &device_description =
kernel.parent()->GetDeviceDescription();
- uint64 blocks_per_sm = CalculateOccupancy(
- device_description, regs_per_thread, smem_per_block, thread_dims);
+ const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
+ CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
+
+ int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
+ smem_per_block, thread_dims, cufunc);
VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;
- // To increase occupancy, there must be a sufficient number of blocks
- // available to spread across the sm's at this new improved occupancy level.
- int multiprocessor_count = device_description.core_count();
- int block_count = block_dims.x * block_dims.y * block_dims.z;
- int available_blocks_per_sm =
- port::MathUtil::CeilOfRatio(block_count, multiprocessor_count);
- if (available_blocks_per_sm <= static_cast<int64>(blocks_per_sm)) {
- VLOG(2) << "Occupancy is limited by number of blocks available per sm.";
- return;
+ int suggested_threads =
+ CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
+ smem_per_block, thread_dims, cufunc);
+ if (suggested_threads != 0) {
+ VLOG(2) << "The cuda occupancy calculator recommends using "
+ << suggested_threads
+ << " threads per block to achieve an occupancy of " << blocks_per_sm
+ << " blocks per SM.";
}
+}
- uint64 improved_regs_per_thread = CalculateRegisterLimitForTargetOccupancy(
- device_description, smem_per_block, thread_dims, blocks_per_sm + 1);
- if (improved_regs_per_thread != 0) {
- VLOG(2) << "Reducing register usage from " << regs_per_thread
- << " to " << improved_regs_per_thread
- << " could increase resident blocks per SM by one.";
+// Compute and return maximum blocks per core (occupancy) based on the
+// device description, some kernel characteristics and the number of threads per
+// block. If unable to compute occupancy, zero is returned.
+int CUDAExecutor::CalculateOccupancy(
+ const DeviceDescription &device_description, uint64 registers_per_thread,
+ uint64 shared_memory_per_block, const ThreadDim &thread_dims,
+ CUfunction func) {
+ int suggested_blocks = 0;
+ int suggested_threads = 0;
+ CUresult err = cuOccupancyMaxPotentialBlockSize(
+ &suggested_blocks, &suggested_threads, func, nullptr,
+ shared_memory_per_block, 0);
+ CHECK_EQ(err, CUDA_SUCCESS);
+ return suggested_blocks;
+}
+
+// Compute and return the suggested thread count to achieve ideal occupancy.
+// If the provided thread dimensions match this number, zero is returned.
+int CUDAExecutor::CompareOccupancy(int *initial_blocks,
+ const DeviceDescription &device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim &thread_dims,
+ CUfunction func) {
+ int suggested_blocks = 0;
+ int suggested_threads = 0;
+ CUresult err = cuOccupancyMaxPotentialBlockSize(
+ &suggested_blocks, &suggested_threads, func, nullptr,
+ shared_memory_per_block, 0);
+ CHECK_EQ(err, CUDA_SUCCESS);
+ if (suggested_blocks > *initial_blocks) {
+ *initial_blocks = suggested_blocks;
+ return suggested_threads;
} else {
- VLOG(2) << "Resident blocks per SM cannot be increased by reducing "
- "register usage.";
+ return 0;
}
}
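[Editor's note] Both new methods delegate to the driver's occupancy calculator, cuOccupancyMaxPotentialBlockSize. Below is a minimal standalone sketch of that call, not part of this patch; the module path and kernel name are hypothetical placeholders, and error handling is reduced to a single check. Note that per the driver API documentation, the first out-parameter is the minimum grid size needed to reach maximum occupancy, which this patch records as suggested_blocks.

// Hedged sketch: driving the CUDA occupancy calculator directly.
#include <cuda.h>
#include <cstdio>

int main() {
  CUdevice dev;
  CUcontext ctx;
  CUmodule mod;
  CUfunction func;
  cuInit(0);
  cuDeviceGet(&dev, 0);
  cuCtxCreate(&ctx, 0, dev);
  cuModuleLoad(&mod, "my_kernel.cubin");         // hypothetical module
  cuModuleGetFunction(&func, mod, "my_kernel");  // hypothetical kernel

  int min_grid_size = 0;    // grid size needed to saturate the device
  int block_size = 0;       // suggested threads per block
  size_t dynamic_smem = 0;  // dynamic shared memory per block, in bytes
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &min_grid_size, &block_size, func,
      /*blockSizeToDynamicSMemSize=*/nullptr, dynamic_smem,
      /*blockSizeLimit=*/0);
  if (err == CUDA_SUCCESS) {
    std::printf("suggested block size: %d (min grid size: %d)\n",
                block_size, min_grid_size);
  }
  cuCtxDestroy(ctx);
  return 0;
}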
@@ -980,144 +1009,6 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
#endif
}
-// Set of compute capability specific device parameters that cannot be
-// queried from the driver API. These values instead are baked into a
-// lookup table indexed by compute capability version.
-struct UnqueryableDeviceParams {
- int cc_major;
- int cc_minor;
- uint64 blocks_per_core_limit;
- uint64 registers_per_core_limit;
- uint64 registers_per_thread_limit;
- uint64 warp_alloc_granularity;
- uint64 register_alloc_granularity;
- uint64 shared_memory_alloc_granularity;
-};
-
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-// https://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls
-static const UnqueryableDeviceParams kAllUnqueryableDeviceParams[] = {
- {
- 2, 0, // compute capability (2.0)
- 8, // blocks_per_core_limit
- 32 * 1024, // registers_per_core_limit
- 63, // registers_per_thread_limit
- 2, // warp_alloc_granularity
- 64, // register_alloc_granularity
- 128, // shared_memory_alloc_granularity
- },
- {
- 2, 1, // compute capability (2.1)
- 8, // blocks_per_core_limit
- 32 * 1024, // registers_per_core_limit
- 63, // registers_per_thread_limit
- 2, // warp_alloc_granularity
- 64, // register_alloc_granularity
- 128, // shared_memory_alloc_granularity
- },
- {
- 3, 0, // compute capability (3.0)
- 16, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 63, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 3, 2, // compute capability (3.2)
- 16, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 3, 5, // compute capability (3.5)
- 16, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 3, 7, // compute capability (3.7)
- 16, // blocks_per_core_limit
- 128 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 5, 0, // compute capability (5.0)
- 32, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 5, 2, // compute capability (5.2)
- 32, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 5, 3, // compute capability (5.3)
- 32, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 6, 0, // compute capability (6.0)
- 32, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 2, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 6, 1, // compute capability (6.1)
- 32, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- {
- 6, 2, // compute capability (6.2)
- 32, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 4, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
- // TODO(jlebar): Confirm the alloc granularity values for sm_70. These are
- // not published in the spreadsheet linked above. Currently we guess that
- // they're the same as sm_60.
- {
- 7, 0, // compute capability (7.0)
- 32, // blocks_per_core_limit
- 64 * 1024, // registers_per_core_limit
- 255, // registers_per_thread_limit
- 2, // warp_alloc_granularity
- 256, // register_alloc_granularity
- 256, // shared_memory_alloc_granularity
- },
-};
DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
internal::DeviceDescriptionBuilder builder;
@@ -1193,19 +1084,6 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
builder.set_name(device_name);
}
- for (size_t i = 0; i < TF_ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
- const auto &params = kAllUnqueryableDeviceParams[i];
- if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) {
- builder.set_blocks_per_core_limit(params.blocks_per_core_limit);
- builder.set_registers_per_core_limit(params.registers_per_core_limit);
- builder.set_registers_per_thread_limit(params.registers_per_thread_limit);
- builder.set_warp_alloc_granularity(params.warp_alloc_granularity);
- builder.set_register_alloc_granularity(params.register_alloc_granularity);
- builder.set_shared_memory_alloc_granularity(
- params.shared_memory_alloc_granularity);
- }
- }
-
builder.set_platform_version(
port::StrCat("Compute Capability ", cc_major_, ".", cc_minor_));
@@ -1227,6 +1105,10 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
builder.set_threads_per_warp(
CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
+ builder.set_registers_per_core_limit(
+ CUDADriver::GetDeviceAttribute(
+ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
+ .ValueOrDie());
auto built = builder.Build();
return built.release();
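[Editor's note] The register limit that previously came from the hard-coded table is now queried from the device at runtime. CUDADriver::GetDeviceAttribute is a thin wrapper over the driver call sketched below; this is an illustrative sketch, not code from the patch, and device ordinal 0 is an arbitrary choice.

// Hedged sketch: reading the per-SM register limit with the driver API.
#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  int regs_per_sm = 0;
  CUresult err = cuDeviceGetAttribute(
      &regs_per_sm, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, dev);
  if (err == CUDA_SUCCESS) {
    std::printf("registers per SM: %d\n", regs_per_sm);
  }
  return 0;
}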
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 8a954d5461..53b2a29ae7 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) override;
+ int CalculateOccupancy(const DeviceDescription &device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim &thread_dims, CUfunction func);
+
+ int CompareOccupancy(int *initial_blocks,
+ const DeviceDescription &device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim &thread_dims, CUfunction func);
+
void *Allocate(uint64 size) override;
void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
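[Editor's note] A caller-side sketch of the new interface, mirroring how VlogOccupancyInfo uses it in the .cc hunk above; the executor and kernel variables are illustrative, not from the patch.

// Hedged sketch of the new call pattern: compute achievable blocks/SM,
// then ask whether a different block size would raise occupancy.
int blocks_per_sm = executor->CalculateOccupancy(
    device_description, regs_per_thread, smem_per_block, thread_dims, cufunc);
int better_threads = executor->CompareOccupancy(
    &blocks_per_sm, device_description, regs_per_thread, smem_per_block,
    thread_dims, cufunc);
if (better_threads != 0) {
  // The calculator found a block size with strictly higher occupancy;
  // blocks_per_sm has been updated in place to the improved value.
}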
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 8ca0677f8a..726c4adf74 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -37,16 +37,11 @@ DeviceDescription::DeviceDescription()
kUninitializedUint64),
block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
kUninitializedUint64),
- blocks_per_core_limit_(kUninitializedUint64),
threads_per_core_limit_(kUninitializedUint64),
threads_per_block_limit_(kUninitializedUint64),
threads_per_warp_(kUninitializedUint64),
registers_per_core_limit_(kUninitializedUint64),
registers_per_block_limit_(kUninitializedUint64),
- registers_per_thread_limit_(kUninitializedUint64),
- warp_alloc_granularity_(1),
- register_alloc_granularity_(1),
- shared_memory_alloc_granularity_(1),
device_address_bits_(kUninitializedUint64),
device_memory_size_(kUninitializedUint64),
memory_bandwidth_(kUninitializedUint64),
@@ -162,75 +157,4 @@ static uint64 RoundDown(uint64 value, uint64 n) {
return port::MathUtil::FloorOfRatio(value, n) * n;
}
-uint64 CalculateOccupancy(const DeviceDescription &device_description,
- uint64 registers_per_thread,
- uint64 shared_memory_per_block,
- const ThreadDim &thread_dims) {
- // Don't try to compute occupancy if necessary values are not initialized.
- uint64 required_fields[] = { device_description.registers_per_thread_limit(),
- device_description.threads_per_warp(),
- device_description.warp_alloc_granularity(),
- device_description.register_alloc_granularity(),
- device_description.registers_per_block_limit(),
- device_description.shared_memory_per_core(),
- device_description.blocks_per_core_limit() };
- for (auto value : required_fields) {
- if (value == kUninitializedUint64) {
- return 0;
- }
- }
-
- if (registers_per_thread > device_description.registers_per_thread_limit()) {
- return 0;
- }
-
- uint64 warps_per_block =
- port::MathUtil::CeilOfRatio(thread_dims.x * thread_dims.y * thread_dims.z,
- device_description.threads_per_warp());
-
- // Warp resources are allocated at a particular granularity. This value is
- // the effective number of warps for resource allocation purposes.
- uint64 alloc_warps_per_block =
- RoundUp(warps_per_block, device_description.warp_alloc_granularity());
-
- uint64 alloc_regs_per_warp =
- RoundUp(device_description.threads_per_warp() * registers_per_thread,
- device_description.register_alloc_granularity());
- uint64 regs_per_block = alloc_warps_per_block * alloc_regs_per_warp;
- uint64 reg_limit =
- device_description.registers_per_block_limit() / regs_per_block;
-
- uint64 alloc_smem_per_block = RoundUp(
- shared_memory_per_block,
- device_description.shared_memory_alloc_granularity());
- uint64 smem_limit = alloc_smem_per_block > 0 ?
- device_description.shared_memory_per_core() / alloc_smem_per_block :
- device_description.blocks_per_core_limit();
-
- uint64 thread_limit = device_description.threads_per_core_limit()
- / (warps_per_block * device_description.threads_per_warp());
-
- return std::min({ device_description.blocks_per_core_limit(),
- reg_limit, smem_limit, thread_limit });
-}
-
-uint64 CalculateRegisterLimitForTargetOccupancy(
- const DeviceDescription &device_description, uint64 shared_memory_per_block,
- const ThreadDim &thread_dims, uint64 target_blocks_per_core) {
- // Linear search from maximum number of registers down until the target
- // blocks per SM is found.
- // TODO(meheff): Compute this using a closed form solution.
- int reg_step = device_description.register_alloc_granularity() /
- device_description.threads_per_warp();
- for (int r = device_description.registers_per_thread_limit(); r > 0;
- r = RoundDown(r - 1, reg_step)) {
- uint64 occupancy = CalculateOccupancy(
- device_description, r, shared_memory_per_block, thread_dims);
- if (occupancy >= target_blocks_per_core) {
- return r;
- }
- }
- return 0;
-}
-
} // namespace stream_executor
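[Editor's note] For contrast with the driver-based replacement, here is a worked instance of the analytic model deleted above, using the now-removed sm_61 table entry (blocks_per_core_limit 32, warp_alloc_granularity 4, register_alloc_granularity 256) plus driver-queried values for that architecture (threads_per_warp 32, threads_per_core_limit 2048, registers_per_block_limit 65536). The kernel parameters, 128 threads per block, 32 registers per thread, no shared memory, are made up for illustration.

// Assumed kernel: 128 threads/block, 32 regs/thread, 0 bytes smem.
warps_per_block        = ceil(128 / 32)          = 4
alloc_warps_per_block  = RoundUp(4, 4)           = 4
alloc_regs_per_warp    = RoundUp(32 * 32, 256)   = 1024
regs_per_block         = 4 * 1024                = 4096
reg_limit              = 65536 / 4096            = 16
smem_limit             = blocks_per_core_limit   = 32   // no shared memory used
thread_limit           = 2048 / (4 * 32)         = 16
occupancy              = min(32, 16, 32, 16)     = 16 blocks per SM

Sixteen resident blocks of 128 threads is 2048 threads, i.e. full occupancy on that architecture, so the analytic model and the driver calculator agree here; the motivation for this commit is that the table feeding the analytic model had to be hand-maintained per compute capability, while cuOccupancyMaxPotentialBlockSize gets the same inputs from the driver.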
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index a4580d6462..8ddf18629d 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -78,10 +78,6 @@ class DeviceDescription {
// legitimate kernel launch request.
const BlockDim &block_dim_limit() const { return block_dim_limit_; }
- // Returns the limit on the number of simultaneously resident blocks
- // on a multiprocessor.
- uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
-
// Returns the limit on the total number of threads that can be launched in a
// single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
// This limit affects what constitutes a legitimate kernel launch request.
@@ -109,27 +105,6 @@ class DeviceDescription {
return registers_per_block_limit_;
}
- // Returns the limit on the total number of registers that can be
- // allocated to a thread.
- const uint64 &registers_per_thread_limit() const {
- return registers_per_thread_limit_;
- }
-
- // Returns the granularity at which warps are allocated resources.
- const uint64 &warp_alloc_granularity() const {
- return warp_alloc_granularity_;
- }
-
- // Returns the granularity at which registers are allocated to warps.
- const uint64 &register_alloc_granularity() const {
- return register_alloc_granularity_;
- }
-
- // Returns the granularity at which shared memory is allocated to warps.
- const uint64 &shared_memory_alloc_granularity() const {
- return shared_memory_alloc_granularity_;
- }
-
// Returns the number of address bits available to kernel code running on the
// platform. This affects things like the maximum allocation size and perhaps
// types used in kernel code such as size_t.
@@ -199,19 +174,12 @@ class DeviceDescription {
ThreadDim thread_dim_limit_;
BlockDim block_dim_limit_;
- uint64 blocks_per_core_limit_;
-
uint64 threads_per_core_limit_;
uint64 threads_per_block_limit_;
uint64 threads_per_warp_;
uint64 registers_per_core_limit_;
uint64 registers_per_block_limit_;
- uint64 registers_per_thread_limit_;
-
- uint64 warp_alloc_granularity_;
- uint64 register_alloc_granularity_;
- uint64 shared_memory_alloc_granularity_;
uint64 device_address_bits_;
uint64 device_memory_size_;
@@ -269,10 +237,6 @@ class DeviceDescriptionBuilder {
device_description_->block_dim_limit_ = value;
}
- void set_blocks_per_core_limit(uint64 value) {
- device_description_->blocks_per_core_limit_ = value;
- }
-
void set_threads_per_core_limit(uint64 value) {
device_description_->threads_per_core_limit_ = value;
}
@@ -289,19 +253,6 @@ class DeviceDescriptionBuilder {
void set_registers_per_block_limit(uint64 value) {
device_description_->registers_per_block_limit_ = value;
}
- void set_registers_per_thread_limit(uint64 value) {
- device_description_->registers_per_thread_limit_ = value;
- }
-
- void set_warp_alloc_granularity(uint64 value) {
- device_description_->warp_alloc_granularity_ = value;
- }
- void set_register_alloc_granularity(uint64 value) {
- device_description_->register_alloc_granularity_ = value;
- }
- void set_shared_memory_alloc_granularity(uint64 value) {
- device_description_->shared_memory_alloc_granularity_ = value;
- }
void set_device_address_bits(uint64 value) {
device_description_->device_address_bits_ = value;
@@ -370,21 +321,6 @@ void CalculateDimensionality(const DeviceDescription &device_description,
uint64 element_count, uint64 *threads_per_block,
uint64 *block_count);
-// Compute and return maximum blocks per core (occupancy) based on the
-// device description, some kernel characteristics and the number of threads per
-// block. If unable to compute occupancy, zero is returned.
-uint64 CalculateOccupancy(const DeviceDescription &device_description,
- uint64 registers_per_thread,
- uint64 shared_memory_per_block,
- const ThreadDim &thread_dims);
-
-// Compute and return the maximum number of registers per thread which
-// achieves the target occupancy. If the target is not possible then
-// zero is returned.
-uint64 CalculateRegisterLimitForTargetOccupancy(
- const DeviceDescription &device_description, uint64 shared_memory_per_block,
- const ThreadDim &thread_dims, uint64 target_blocks_per_core);
-
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_