Move CUDA-specific occupancy calculation into proper file

- Maintain functionality; just move the CalculateOccupancy() and CompareOccupancy() methods from device_description to cuda_gpu_executor.
- Remove the CUDA requirement from the general device_description class.
author: Matt Conley <mconley@nvidia.com> 2018-09-04 14:20:40 -0700
committer: Matt Conley <mconley@nvidia.com> 2018-09-04 14:20:40 -0700
commit: fa20b59b920233d35bb8da3fbc3c234c369a8291 (patch)
tree: 76c388948c0cc638275513b32cdbd505ccd574ce /tensorflow/stream_executor/cuda
parent: e93a9f9ccfd9c7a2419bf3fc1d7866765bbcfce3 (diff)
2 files changed, 48 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 39b0696c93..458c0e3030 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
   }
 }
 
+// Returns the block count that cuOccupancyMaxPotentialBlockSize suggests for
+// `func` with `shared_memory_per_block` bytes of dynamic shared memory. The
+// driver call is CHECKed for success; the remaining parameters are unused.
+int CUDAExecutor::CalculateOccupancy(
+    const DeviceDescription& device_description, uint64 registers_per_thread,
+    uint64 shared_memory_per_block, const ThreadDim& thread_dims,
+    CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err = cuOccupancyMaxPotentialBlockSize(
+      &suggested_blocks, &suggested_threads, func, nullptr,
+      shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  return suggested_blocks;
+}
+
+// Asks the CUDA driver for the launch configuration it suggests for `func`
+// (with `shared_memory_per_block` bytes of dynamic shared memory). If the
+// suggested block count exceeds `*initial_blocks`, stores it there and
+// returns the suggested threads per block; otherwise returns zero. The other
+// parameters are currently unused. The driver call is CHECKed for success.
+int CUDAExecutor::CompareOccupancy(
+    int* initial_blocks, const DeviceDescription& device_description,
+    uint64 registers_per_thread, uint64 shared_memory_per_block,
+    const ThreadDim& thread_dims, CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err = cuOccupancyMaxPotentialBlockSize(
+      &suggested_blocks, &suggested_threads, func, nullptr,
+      shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  // Only report a suggestion when it exceeds the caller's block count.
+  if (suggested_blocks <= *initial_blocks) return 0;
+  *initial_blocks = suggested_blocks;
+  return suggested_threads;
+}
+
 void *CUDAExecutor::Allocate(uint64 size) {
   return CUDADriver::DeviceAllocate(context_, size);
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 8a954d5461..e8ebbc3220 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
               const BlockDim &block_dims, const KernelBase &k,
               const KernelArgsArrayBase &args) override;
 
+  // Returns the block count the CUDA driver suggests for launching `func`.
+  int CalculateOccupancy(
+      const DeviceDescription& device_description, uint64 registers_per_thread,
+      uint64 shared_memory_per_block, const ThreadDim& thread_dims,
+      CUfunction func);
+  // Returns the suggested thread count after raising `*initial_blocks`, else 0.
+  int CompareOccupancy(
+      int* initial_blocks, const DeviceDescription& device_description,
+      uint64 registers_per_thread, uint64 shared_memory_per_block,
+      const ThreadDim& thread_dims, CUfunction func);
+
   void *Allocate(uint64 size) override;
 
   void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
author	Matt Conley <mconley@nvidia.com>	2018-09-04 14:20:40 -0700
committer	Matt Conley <mconley@nvidia.com>	2018-09-04 14:20:40 -0700
commit	fa20b59b920233d35bb8da3fbc3c234c369a8291 (patch)
tree	76c388948c0cc638275513b32cdbd505ccd574ce /tensorflow/stream_executor/cuda
parent	e93a9f9ccfd9c7a2419bf3fc1d7866765bbcfce3 (diff)