about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.cc12
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.h10
2 files changed, 11 insertions, 11 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index ef84d01a94..9d5bcc7f77 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -472,7 +472,7 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
const DeviceDescription &device_description =
kernel.parent()->GetDeviceDescription();
- const CUDAKernel* cuda_kernel = AsCUDAKernel(&kernel);
+ const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
@@ -494,8 +494,8 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
// device description, some kernel characteristics and the number of threads per
// block. If unable to compute occupancy, zero is returned.
int CUDAExecutor::CalculateOccupancy(
- const DeviceDescription& device_description, uint64 registers_per_thread,
- uint64 shared_memory_per_block, const ThreadDim& thread_dims,
+ const DeviceDescription &device_description, uint64 registers_per_thread,
+ uint64 shared_memory_per_block, const ThreadDim &thread_dims,
CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
@@ -508,11 +508,11 @@ int CUDAExecutor::CalculateOccupancy(
// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
-int CUDAExecutor::CompareOccupancy(int* initial_blocks,
- const DeviceDescription& device_description,
+int CUDAExecutor::CompareOccupancy(int *initial_blocks,
+ const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
- const ThreadDim& thread_dims,
+ const ThreadDim &thread_dims,
CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 1481dcc19a..53b2a29ae7 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -70,16 +70,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) override;
- int CalculateOccupancy(const DeviceDescription& device_description,
+ int CalculateOccupancy(const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
- const ThreadDim& thread_dims, CUfunction func);
+ const ThreadDim &thread_dims, CUfunction func);
- int CompareOccupancy(int* initial_blocks,
- const DeviceDescription& device_description,
+ int CompareOccupancy(int *initial_blocks,
+ const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
- const ThreadDim& thread_dims, CUfunction func);
+ const ThreadDim &thread_dims, CUfunction func);
void *Allocate(uint64 size) override;