aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc')
-rw-r--r--tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc19
1 files changed, 11 insertions, 8 deletions
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index ecfe51d599..903aac5d68 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -105,7 +105,7 @@ __global__ void __launch_bounds__(1024, 2)
const int input_row_end = input_row_start + filter_rows;
const int input_col_end = input_col_start + filter_cols;
- T sum = 0;
+ T sum = static_cast<T>(0);
const int input_offset_temp = in_rows * OB;
if (input_row_start >= 0 && input_col_start >= 0 &&
@@ -258,8 +258,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
__syncthreads();
if (depth_in_range) {
- T sum1 = 0;
- T sum2 = 0;
+ T sum1 = static_cast<T>(0);
+ T sum2 = static_cast<T>(0);
int shared_offset = data_idx;
const T* filter_ptr = filter_read_offset + shared_data;
UNROLL for (int r = 0; r < filter_rows; ++r) {
@@ -369,7 +369,7 @@ __global__ void __launch_bounds__(1024, 2)
const int input_row_end = input_row_start + filter_rows;
const int input_col_end = input_col_start + filter_cols;
- T sum = 0;
+ T sum = static_cast<T>(0);
if (input_row_start >= 0 && input_col_start >= 0 &&
input_row_end < in_rows && input_col_end < in_cols) {
// Loop that doesn't need to check for boundary conditions.
@@ -529,8 +529,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
__syncthreads();
if (slice_in_range) {
- T sum1 = 0;
- T sum2 = 0;
+ T sum1 = static_cast<T>(0);
+ T sum2 = static_cast<T>(0);
int shared_offset = data_idx;
const T* filter_ptr = filter_read_offset + shared_data;
UNROLL for (int r = 0; r < filter_rows; ++r) {
@@ -710,6 +710,7 @@ void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
"Launch of gpu kernel for DepthwiseConv2dGPULaunch failed"));
}
+template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvOp<GPUDevice, float>;
template struct LaunchDepthwiseConvOp<GPUDevice, double>;
@@ -744,7 +745,7 @@ __global__ void __launch_bounds__(640, 2)
const int in_r = (thread_id / in_depth / in_cols) % in_rows;
const int b = thread_id / in_depth / in_cols / in_rows;
- T sum = 0;
+ T sum = static_cast<T>(0);
const int out_r_start =
tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
@@ -810,7 +811,7 @@ __global__ void __launch_bounds__(640, 2)
const int in_d = (thread_id / in_cols / in_rows) % in_depth;
const int b = thread_id / in_depth / in_cols / in_rows;
- T sum = 0;
+ T sum = static_cast<T>(0);
const int out_d_start = in_d * depth_multiplier;
const int out_d_end = out_d_start + depth_multiplier;
@@ -919,6 +920,7 @@ void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
"utGPULaunch failed"));
}
+template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
@@ -1631,6 +1633,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
"terGPULaunch failed"));
}
+template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
} // namespace tensorflow