aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_gpu_executor.cc')
-rw-r--r-- tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | 37
1 file changed, 37 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 39b0696c93..458c0e3030 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
}
}
+// Returns the block count suggested by cuOccupancyMaxPotentialBlockSize for
+// `func` (per CUDA docs this is the *minimum grid size* needed for maximum
+// occupancy, not blocks per core). NOTE(review): device_description,
+int CalculateOccupancy(const DeviceDescription& device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim& thread_dims, CUfunction func) {
+ int suggested_blocks = 0;
+ int suggested_threads = 0;
+ CUresult err =
+ cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+ func, NULL, shared_memory_per_block, 0); // registers_per_thread and thread_dims are unused here.
+ CHECK_EQ(err, CUDA_SUCCESS); // CHECK-fails on driver error; it does NOT return zero as the old comment claimed.
+ return suggested_blocks;
+}
+
+// Returns the thread count suggested by cuOccupancyMaxPotentialBlockSize when
+// its suggested block count beats *initial_blocks (which is then updated in
+int CompareOccupancy(int* initial_blocks,
+ const DeviceDescription& device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim& thread_dims, CUfunction func) {
+ int suggested_blocks = 0;
+ int suggested_threads = 0;
+ CUresult err =
+ cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+ func, NULL, shared_memory_per_block, 0); // NOTE(review): device_description, registers_per_thread, thread_dims unused.
+ CHECK_EQ(err, CUDA_SUCCESS); // crash-fails on driver error rather than reporting it
+ if (suggested_blocks > *initial_blocks) { // place); otherwise returns zero. (Return is unrelated to thread-dim equality.)
+ *initial_blocks = suggested_blocks;
+ return suggested_threads;
+ } else {
+ return 0;
+ }
+}
+
void *CUDAExecutor::Allocate(uint64 size) { // Allocates `size` bytes of device memory via the driver wrapper.
return CUDADriver::DeviceAllocate(context_, size);
}