aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor
diff options
context:
space:
mode:
authorGravatar Matt Conley <mconley@nvidia.com>2018-09-04 14:20:40 -0700
committerGravatar Matt Conley <mconley@nvidia.com>2018-09-04 14:20:40 -0700
commitfa20b59b920233d35bb8da3fbc3c234c369a8291 (patch)
tree76c388948c0cc638275513b32cdbd505ccd574ce /tensorflow/stream_executor
parente93a9f9ccfd9c7a2419bf3fc1d7866765bbcfce3 (diff)
Move CUDA-specific occupancy calculation into proper file
- Maintain functionality; just move the CalculateOccupancy() and CompareOccupancy() methods from device_description to cuda_gpu_executor. - Remove the CUDA requirement from the general device_description class.
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.cc37
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.h11
-rw-r--r--tensorflow/stream_executor/device_description.cc32
-rw-r--r--tensorflow/stream_executor/device_description.h17
4 files changed, 48 insertions, 49 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 39b0696c93..458c0e3030 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
}
}
+// Returns the minimum grid size (block count) the CUDA driver suggests for
+// running `func` at maximum occupancy; only `func` and
+// `shared_memory_per_block` are consulted. CHECK-fails if the query fails.
+int CUDAExecutor::CalculateOccupancy(
+    const DeviceDescription& device_description, uint64 registers_per_thread,
+    uint64 shared_memory_per_block, const ThreadDim& thread_dims,
+    CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err = cuOccupancyMaxPotentialBlockSize(
+      &suggested_blocks, &suggested_threads, func, nullptr,
+      shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  return suggested_blocks;
+}
+
+// Queries the CUDA driver for the launch configuration that maximizes the
+// occupancy of `func`. If the suggested block count exceeds `*initial_blocks`,
+// stores it there and returns the suggested threads per block; otherwise
+// returns zero. CHECK-fails if the driver query is unsuccessful.
+int CUDAExecutor::CompareOccupancy(
+    int* initial_blocks, const DeviceDescription& device_description,
+    uint64 registers_per_thread, uint64 shared_memory_per_block,
+    const ThreadDim& thread_dims, CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err = cuOccupancyMaxPotentialBlockSize(
+      &suggested_blocks, &suggested_threads, func, nullptr,
+      shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  if (suggested_blocks <= *initial_blocks) {
+    return 0;
+  }
+  *initial_blocks = suggested_blocks;
+  return suggested_threads;
+}
+
void *CUDAExecutor::Allocate(uint64 size) {
return CUDADriver::DeviceAllocate(context_, size);
}
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 8a954d5461..e8ebbc3220 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) override;
+ int CalculateOccupancy(const DeviceDescription& device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim& thread_dims, CUfunction func);
+
+ int CompareOccupancy(int* initial_blocks,
+ const DeviceDescription& device_description,
+ uint64 registers_per_thread,
+ uint64 shared_memory_per_block,
+ const ThreadDim& thread_dims, CUfunction func);
+
void *Allocate(uint64 size) override;
void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index df52ce6cce..726c4adf74 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -157,36 +157,4 @@ static uint64 RoundDown(uint64 value, uint64 n) {
return port::MathUtil::FloorOfRatio(value, n) * n;
}
-int CalculateOccupancy(const DeviceDescription& device_description,
- uint64 registers_per_thread,
- uint64 shared_memory_per_block,
- const ThreadDim& thread_dims, CUfunction func) {
- int suggested_blocks = 0;
- int suggested_threads = 0;
- CUresult err =
- cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
- func, NULL, shared_memory_per_block, 0);
- CHECK_EQ(err, CUDA_SUCCESS);
- return suggested_blocks;
-}
-
-int CompareOccupancy(int* initial_blocks,
- const DeviceDescription& device_description,
- uint64 registers_per_thread,
- uint64 shared_memory_per_block,
- const ThreadDim& thread_dims, CUfunction func) {
- int suggested_blocks = 0;
- int suggested_threads = 0;
- CUresult err =
- cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
- func, NULL, shared_memory_per_block, 0);
- CHECK_EQ(err, CUDA_SUCCESS);
- if (suggested_blocks > *initial_blocks) {
- *initial_blocks = suggested_blocks;
- return suggested_threads;
- } else {
- return 0;
- }
-}
-
} // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index d335b9b875..b15ce31216 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -24,7 +24,6 @@ limitations under the License.
#include <memory>
#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/platform/port.h"
@@ -324,22 +323,6 @@ void CalculateDimensionality(const DeviceDescription &device_description,
uint64 element_count, uint64 *threads_per_block,
uint64 *block_count);
-// Compute and return maximum blocks per core (occupancy) based on the
-// device description, some kernel characteristics and the number of threads per
-// block. If unable to compute occupancy, zero is returned.
-int CalculateOccupancy(const DeviceDescription& device_description,
- uint64 registers_per_thread,
- uint64 shared_memory_per_block,
- const ThreadDim& thread_dims, CUfunction func);
-
-// Compute and return the suggested thread count to acheive ideal occupancy.
-// If the provided thread dimensions match this number, zero is returned.
-int CompareOccupancy(int* initial_blocks,
- const DeviceDescription& device_description,
- uint64 registers_per_thread,
- uint64 shared_memory_per_block,
- const ThreadDim& thread_dims, CUfunction func);
-
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_