diff options
author | Matt Conley <mconley@nvidia.com> | 2018-09-04 14:20:40 -0700 |
---|---|---|
committer | Matt Conley <mconley@nvidia.com> | 2018-09-04 14:20:40 -0700 |
commit | fa20b59b920233d35bb8da3fbc3c234c369a8291 (patch) | |
tree | 76c388948c0cc638275513b32cdbd505ccd574ce /tensorflow/stream_executor | |
parent | e93a9f9ccfd9c7a2419bf3fc1d7866765bbcfce3 (diff) |
Move CUDA-specific occupancy calculation into proper file
- Maintain functionality; just move the CalculateOccupancy() and CompareOccupancy() methods from device_description to cuda_gpu_executor.
- Remove the CUDA requirement from the general device_description class.
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | 37 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_gpu_executor.h | 11 | ||||
-rw-r--r-- | tensorflow/stream_executor/device_description.cc | 32 | ||||
-rw-r--r-- | tensorflow/stream_executor/device_description.h | 17 |
4 files changed, 48 insertions, 49 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc index 39b0696c93..458c0e3030 100644 --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc @@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel, } } +// Compute and return maximum blocks per core (occupancy) based on the +// device description, some kernel characteristics and the number of threads per +// block. If unable to compute occupancy, zero is returned. +int CalculateOccupancy(const DeviceDescription& device_description, + uint64 registers_per_thread, + uint64 shared_memory_per_block, + const ThreadDim& thread_dims, CUfunction func) { + int suggested_blocks = 0; + int suggested_threads = 0; + CUresult err = + cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads, + func, NULL, shared_memory_per_block, 0); + CHECK_EQ(err, CUDA_SUCCESS); + return suggested_blocks; +} + +// Compute and return the suggested thread count to achieve ideal occupancy. +// If the provided thread dimensions match this number, zero is returned. 
+int CompareOccupancy(int* initial_blocks, + const DeviceDescription& device_description, + uint64 registers_per_thread, + uint64 shared_memory_per_block, + const ThreadDim& thread_dims, CUfunction func) { + int suggested_blocks = 0; + int suggested_threads = 0; + CUresult err = + cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads, + func, NULL, shared_memory_per_block, 0); + CHECK_EQ(err, CUDA_SUCCESS); + if (suggested_blocks > *initial_blocks) { + *initial_blocks = suggested_blocks; + return suggested_threads; + } else { + return 0; + } +} + void *CUDAExecutor::Allocate(uint64 size) { return CUDADriver::DeviceAllocate(context_, size); } diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h index 8a954d5461..e8ebbc3220 100644 --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h @@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface { const BlockDim &block_dims, const KernelBase &k, const KernelArgsArrayBase &args) override; + int CalculateOccupancy(const DeviceDescription& device_description, + uint64 registers_per_thread, + uint64 shared_memory_per_block, + const ThreadDim& thread_dims, CUfunction func); + + int CompareOccupancy(int* initial_blocks, + const DeviceDescription& device_description, + uint64 registers_per_thread, + uint64 shared_memory_per_block, + const ThreadDim& thread_dims, CUfunction func); + void *Allocate(uint64 size) override; void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc index df52ce6cce..726c4adf74 100644 --- a/tensorflow/stream_executor/device_description.cc +++ b/tensorflow/stream_executor/device_description.cc @@ -157,36 +157,4 @@ static uint64 RoundDown(uint64 value, uint64 n) { return port::MathUtil::FloorOfRatio(value, n) * n; } 
-int CalculateOccupancy(const DeviceDescription& device_description, - uint64 registers_per_thread, - uint64 shared_memory_per_block, - const ThreadDim& thread_dims, CUfunction func) { - int suggested_blocks = 0; - int suggested_threads = 0; - CUresult err = - cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads, - func, NULL, shared_memory_per_block, 0); - CHECK_EQ(err, CUDA_SUCCESS); - return suggested_blocks; -} - -int CompareOccupancy(int* initial_blocks, - const DeviceDescription& device_description, - uint64 registers_per_thread, - uint64 shared_memory_per_block, - const ThreadDim& thread_dims, CUfunction func) { - int suggested_blocks = 0; - int suggested_threads = 0; - CUresult err = - cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads, - func, NULL, shared_memory_per_block, 0); - CHECK_EQ(err, CUDA_SUCCESS); - if (suggested_blocks > *initial_blocks) { - *initial_blocks = suggested_blocks; - return suggested_threads; - } else { - return 0; - } -} - } // namespace stream_executor diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h index d335b9b875..b15ce31216 100644 --- a/tensorflow/stream_executor/device_description.h +++ b/tensorflow/stream_executor/device_description.h @@ -24,7 +24,6 @@ limitations under the License. #include <memory> #include "tensorflow/stream_executor/platform/port.h" -#include "tensorflow/stream_executor/cuda/cuda_driver.h" #include "tensorflow/stream_executor/launch_dim.h" #include "tensorflow/stream_executor/platform/port.h" @@ -324,22 +323,6 @@ void CalculateDimensionality(const DeviceDescription &device_description, uint64 element_count, uint64 *threads_per_block, uint64 *block_count); -// Compute and return maximum blocks per core (occupancy) based on the -// device description, some kernel characteristics and the number of threads per -// block. If unable to compute occupancy, zero is returned. 
-int CalculateOccupancy(const DeviceDescription& device_description, - uint64 registers_per_thread, - uint64 shared_memory_per_block, - const ThreadDim& thread_dims, CUfunction func); - -// Compute and return the suggested thread count to achieve ideal occupancy. -// If the provided thread dimensions match this number, zero is returned. -int CompareOccupancy(int* initial_blocks, - const DeviceDescription& device_description, - uint64 registers_per_thread, - uint64 shared_memory_per_block, - const ThreadDim& thread_dims, CUfunction func); - } // namespace stream_executor #endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ |