From fa20b59b920233d35bb8da3fbc3c234c369a8291 Mon Sep 17 00:00:00 2001
From: Matt Conley
Date: Tue, 4 Sep 2018 14:20:40 -0700
Subject: Move CUDA-specific occupancy calculation into proper file

-Maintain functionality, just move CalculateOccupancy() and CompareOccupancy()
 methods from device_description to cuda_gpu_executor
-Remove CUDA requirement in general class device_description
---
 .../stream_executor/cuda/cuda_gpu_executor.cc    | 37 ++++++++++++++++++++++
 .../stream_executor/cuda/cuda_gpu_executor.h     | 11 +++++++
 tensorflow/stream_executor/device_description.cc | 32 -------------------
 tensorflow/stream_executor/device_description.h  | 17 ----------
 4 files changed, 48 insertions(+), 49 deletions(-)

(limited to 'tensorflow/stream_executor')

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 39b0696c93..458c0e3030 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
   }
 }
 
+// Compute and return maximum blocks per core (occupancy) based on the
+// device description, some kernel characteristics and the number of threads per
+// block. If unable to compute occupancy, zero is returned.
+int CalculateOccupancy(const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err =
+      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+                                       func, NULL, shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  return suggested_blocks;
+}
+
+// Compute and return the suggested thread count to achieve ideal occupancy.
+// If the provided thread dimensions match this number, zero is returned.
+int CompareOccupancy(int* initial_blocks,
+                     const DeviceDescription& device_description,
+                     uint64 registers_per_thread,
+                     uint64 shared_memory_per_block,
+                     const ThreadDim& thread_dims, CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err =
+      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+                                       func, NULL, shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  if (suggested_blocks > *initial_blocks) {
+    *initial_blocks = suggested_blocks;
+    return suggested_threads;
+  } else {
+    return 0;
+  }
+}
+
 void *CUDAExecutor::Allocate(uint64 size) {
   return CUDADriver::DeviceAllocate(context_, size);
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 8a954d5461..e8ebbc3220 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
              const BlockDim &block_dims, const KernelBase &k,
              const KernelArgsArrayBase &args) override;
 
+  int CalculateOccupancy(const DeviceDescription& device_description,
+                         uint64 registers_per_thread,
+                         uint64 shared_memory_per_block,
+                         const ThreadDim& thread_dims, CUfunction func);
+
+  int CompareOccupancy(int* initial_blocks,
+                       const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, CUfunction func);
+
   void *Allocate(uint64 size) override;
 
   void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index df52ce6cce..726c4adf74 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -157,36 +157,4 @@ static uint64 RoundDown(uint64 value, uint64 n) {
   return port::MathUtil::FloorOfRatio(value, n) * n;
 }
 
-int CalculateOccupancy(const DeviceDescription& device_description,
-                       uint64 registers_per_thread,
-                       uint64 shared_memory_per_block,
-                       const ThreadDim& thread_dims, CUfunction func) {
-  int suggested_blocks = 0;
-  int suggested_threads = 0;
-  CUresult err =
-      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
-                                       func, NULL, shared_memory_per_block, 0);
-  CHECK_EQ(err, CUDA_SUCCESS);
-  return suggested_blocks;
-}
-
-int CompareOccupancy(int* initial_blocks,
-                     const DeviceDescription& device_description,
-                     uint64 registers_per_thread,
-                     uint64 shared_memory_per_block,
-                     const ThreadDim& thread_dims, CUfunction func) {
-  int suggested_blocks = 0;
-  int suggested_threads = 0;
-  CUresult err =
-      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
-                                       func, NULL, shared_memory_per_block, 0);
-  CHECK_EQ(err, CUDA_SUCCESS);
-  if (suggested_blocks > *initial_blocks) {
-    *initial_blocks = suggested_blocks;
-    return suggested_threads;
-  } else {
-    return 0;
-  }
-}
-
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index d335b9b875..b15ce31216 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -24,7 +24,6 @@ limitations under the License.
 
 #include
 
 #include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/platform/port.h"
@@ -324,22 +323,6 @@ void CalculateDimensionality(const DeviceDescription &device_description,
                              uint64 element_count, uint64 *threads_per_block,
                              uint64 *block_count);
 
-// Compute and return maximum blocks per core (occupancy) based on the
-// device description, some kernel characteristics and the number of threads per
-// block. If unable to compute occupancy, zero is returned.
-int CalculateOccupancy(const DeviceDescription& device_description,
-                       uint64 registers_per_thread,
-                       uint64 shared_memory_per_block,
-                       const ThreadDim& thread_dims, CUfunction func);
-
-// Compute and return the suggested thread count to acheive ideal occupancy.
-// If the provided thread dimensions match this number, zero is returned.
-int CompareOccupancy(int* initial_blocks,
-                     const DeviceDescription& device_description,
-                     uint64 registers_per_thread,
-                     uint64 shared_memory_per_block,
-                     const ThreadDim& thread_dims, CUfunction func);
-
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
--
cgit v1.2.3
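
The pattern the moved helpers wrap is a single CUDA driver call, cuOccupancyMaxPotentialBlockSize: ask the driver for an occupancy-optimal block size (and the minimum grid size needed to reach full occupancy), then compare that suggestion against the launch configuration actually in use, which is how the surrounding VlogOccupancyInfo logging is meant to consume them. Below is a minimal standalone sketch of that pattern against the raw driver API, not StreamExecutor code; the module path "my_kernels.ptx", the kernel name "my_kernel", the SuggestedBlocks helper, and the 128-thread launch configuration are illustrative assumptions, not part of this patch.

// Minimal, self-contained sketch (not StreamExecutor code).
// Build with e.g.:  g++ occupancy_sketch.cc -lcuda
#include <cstdio>
#include <cuda.h>

// Same driver query the moved helpers issue: the driver reports a block size
// that maximizes occupancy and the minimum grid size needed to reach it.
static int SuggestedBlocks(CUfunction func, size_t dynamic_smem_bytes,
                           int* suggested_threads) {
  int min_grid_size = 0;
  int block_size = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &min_grid_size, &block_size, func, /*blockSizeToDynamicSMemSize=*/NULL,
      dynamic_smem_bytes, /*blockSizeLimit=*/0);
  if (err != CUDA_SUCCESS) {
    std::fprintf(stderr, "occupancy query failed: %d\n", static_cast<int>(err));
    return 0;  // Return zero when occupancy cannot be computed.
  }
  *suggested_threads = block_size;
  return min_grid_size;
}

int main() {
  cuInit(0);
  CUdevice device;
  cuDeviceGet(&device, 0);
  CUcontext context;
  cuCtxCreate(&context, 0, device);

  CUmodule module;
  CUfunction kernel;
  // Placeholder module/kernel names; a real program loads its own PTX/cubin.
  if (cuModuleLoad(&module, "my_kernels.ptx") != CUDA_SUCCESS ||
      cuModuleGetFunction(&kernel, module, "my_kernel") != CUDA_SUCCESS) {
    std::fprintf(stderr, "no kernel module available; skipping check\n");
    cuCtxDestroy(context);
    return 0;
  }

  // Loosely analogous to CompareOccupancy(): flag a launch configuration that
  // differs from the driver's suggested thread count.
  const int chosen_threads_per_block = 128;
  int suggested_threads = 0;
  int suggested_blocks =
      SuggestedBlocks(kernel, /*dynamic_smem_bytes=*/0, &suggested_threads);
  if (suggested_blocks > 0 && suggested_threads != chosen_threads_per_block) {
    std::printf("kernel launched with %d threads/block; driver suggests %d\n",
                chosen_threads_per_block, suggested_threads);
  }

  cuModuleUnload(module);
  cuCtxDestroy(context);
  return 0;
}

Unlike this sketch, the moved StreamExecutor helpers CHECK-fail if the driver query itself errors, and they accept device-description, register, and thread-dimension parameters that the underlying cuOccupancyMaxPotentialBlockSize call does not consume.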