aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda
diff options
context:
space:
mode:
authorGravatar Matt Conley <mconley@nvidia.com>2018-09-04 14:20:40 -0700
committerGravatar Matt Conley <mconley@nvidia.com>2018-09-04 14:20:40 -0700
commitfa20b59b920233d35bb8da3fbc3c234c369a8291 (patch)
tree76c388948c0cc638275513b32cdbd505ccd574ce /tensorflow/stream_executor/cuda
parente93a9f9ccfd9c7a2419bf3fc1d7866765bbcfce3 (diff)
Move CUDA-specific occupancy calculation into proper file
-Maintain functionality, just move CalculateOccupancy() and CompareOccupancy() methods from device_description to cuda_gpu_executor -Remove CUDA requirement in general class device_description
Diffstat (limited to 'tensorflow/stream_executor/cuda')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.cc37
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.h11
2 files changed, 48 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 39b0696c93..458c0e3030 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
}
}
+// Computes and returns the maximum number of this kernel's blocks that can be
+// resident on a single multiprocessor ("occupancy", in blocks per core),
+// given the thread dimensions the kernel will actually be launched with.
+// Crashes (CHECK failure) if the driver cannot compute occupancy for `func`.
+//
+// NOTE(review): device_description and registers_per_thread are currently
+// unused — the CUDA driver derives register usage from `func` itself; the
+// parameters are kept for interface stability with the previous
+// device_description-based implementation.
+int CUDAExecutor::CalculateOccupancy(
+    const DeviceDescription& device_description, uint64 registers_per_thread,
+    uint64 shared_memory_per_block, const ThreadDim& thread_dims,
+    CUfunction func) {
+  // The occupancy query needs the total threads per block of the real launch
+  // configuration; cuOccupancyMaxPotentialBlockSize would instead report a
+  // *suggested* configuration and a whole-device grid size, which is not
+  // blocks-per-core.
+  int block_size =
+      static_cast<int>(thread_dims.x * thread_dims.y * thread_dims.z);
+  int blocks_per_sm = 0;
+  CUresult err = cuOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, func, block_size, shared_memory_per_block);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  return blocks_per_sm;
+}
+
+// Computes and returns the suggested number of threads per block to achieve
+// ideal occupancy for this kernel. If the best achievable block count does
+// not exceed *initial_blocks, zero is returned (the current configuration is
+// already as good). On improvement, *initial_blocks is updated in place to
+// the achievable maximum. Crashes (CHECK failure) on driver query failure.
+//
+// NOTE(review): device_description, registers_per_thread and thread_dims are
+// currently unused — the driver derives resource usage from `func`; the
+// parameters are kept for interface stability.
+int CUDAExecutor::CompareOccupancy(int* initial_blocks,
+                                   const DeviceDescription& device_description,
+                                   uint64 registers_per_thread,
+                                   uint64 shared_memory_per_block,
+                                   const ThreadDim& thread_dims,
+                                   CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err =
+      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+                                       func, NULL, shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  if (suggested_blocks > *initial_blocks) {
+    *initial_blocks = suggested_blocks;
+    return suggested_threads;
+  }
+  return 0;
+}
+
void *CUDAExecutor::Allocate(uint64 size) {
return CUDADriver::DeviceAllocate(context_, size);
}
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 8a954d5461..e8ebbc3220 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) override;
+  // Returns the maximum number of this kernel's blocks that can be resident
+  // per multiprocessor (occupancy), based on the kernel `func` and the given
+  // thread dimensions and shared memory usage. See cuda_gpu_executor.cc for
+  // implementation notes.
+  int CalculateOccupancy(const DeviceDescription& device_description,
+                         uint64 registers_per_thread,
+                         uint64 shared_memory_per_block,
+                         const ThreadDim& thread_dims, CUfunction func);
+
+  // Returns the suggested threads-per-block to reach ideal occupancy, or
+  // zero when *initial_blocks cannot be improved; on improvement,
+  // *initial_blocks is updated to the achievable maximum block count.
+  int CompareOccupancy(int* initial_blocks,
+                       const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, CUfunction func);
+
void *Allocate(uint64 size) override;
void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,