blob: 09304af13c0832d167e9ad8a68aa1f8eb952c5ef (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
#if GOOGLE_CUDA
#include <algorithm>
// Grid-stride loop for 1D kernels: declares an `int` index `i` that starts at
// this thread's global index (blockIdx.x * blockDim.x + threadIdx.x) and
// advances by the total number of launched threads (blockDim.x * gridDim.x)
// while `i < n`.
//
// `n` is re-evaluated on every iteration and is parenthesized in the
// expansion, so any expression (e.g. `flag ? a : b`) can be passed safely.
// The index is a plain `int`, so `n` is expected to fit in an `int`.
#define CUDA_1D_KERNEL_LOOP(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);    \
       i += blockDim.x * gridDim.x)
namespace tensorflow {
// Short alias for Eigen's GPU device type, used throughout this header.
using GPUDevice = Eigen::GpuDevice;
// Bundle of launch parameters for a Cuda kernel. Every field defaults to -1
// until it is filled in.
struct CudaLaunchConfig {
  // Count of logical threads working on the elements. When every logical
  // thread handles exactly one element, this equals the number of elements
  // being processed.
  int virtual_thread_count{-1};

  // How many threads each block contains.
  int thread_per_block{-1};

  // How many blocks the kernel launch uses.
  int block_count{-1};
};
// Calculate the Cuda launch config we should use for a kernel launch.
// This is assuming the kernel is quite simple and will largely be
// memory-limited.
//
// Args:
//   work_element_count: number of elements the kernel will process; stored
//     unchanged in the returned `virtual_thread_count`.
//   d: device queried for its multiprocessor count and thread limits.
//
// Returns a config where:
//   - `thread_per_block` is min(1024, d.maxCudaThreadsPerBlock());
//   - `block_count` is the number of blocks needed to cover the physical
//     thread count, capped at the number of multiprocessors and never less
//     than 1.
//
// The lower bound of 1 matters when `work_element_count` is zero or negative:
// a grid dimension of 0 is not a valid launch configuration, whereas a single
// block whose loop bound is `virtual_thread_count` (e.g. via
// CUDA_1D_KERNEL_LOOP) simply performs no iterations.
inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
                                            const GPUDevice& d) {
  const int virtual_thread_count = work_element_count;
  // Never ask for more threads than the device can keep resident at once.
  const int physical_thread_count = std::min(
      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
      virtual_thread_count);
  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
  // Ceiling division: blocks required to host `physical_thread_count` threads.
  const int blocks_needed =
      (physical_thread_count + thread_per_block - 1) / thread_per_block;
  // Cap at one block per multiprocessor, but always launch at least one block
  // so that an empty (or negative) workload still yields a valid grid.
  const int block_count =
      std::max(1, std::min(blocks_needed, d.getNumCudaMultiProcessors()));

  CudaLaunchConfig config;
  config.virtual_thread_count = virtual_thread_count;
  config.thread_per_block = thread_per_block;
  config.block_count = block_count;
  return config;
}
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // THIRD_PARTY_TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
|