diff options
Diffstat (limited to 'tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc')
-rw-r--r-- | tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc | 19 |
1 files changed, 11 insertions, 8 deletions
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index ecfe51d599..903aac5d68 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -105,7 +105,7 @@ __global__ void __launch_bounds__(1024, 2) const int input_row_end = input_row_start + filter_rows; const int input_col_end = input_col_start + filter_cols; - T sum = 0; + T sum = static_cast<T>(0); const int input_offset_temp = in_rows * OB; if (input_row_start >= 0 && input_col_start >= 0 && @@ -258,8 +258,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( __syncthreads(); if (depth_in_range) { - T sum1 = 0; - T sum2 = 0; + T sum1 = static_cast<T>(0); + T sum2 = static_cast<T>(0); int shared_offset = data_idx; const T* filter_ptr = filter_read_offset + shared_data; UNROLL for (int r = 0; r < filter_rows; ++r) { @@ -369,7 +369,7 @@ __global__ void __launch_bounds__(1024, 2) const int input_row_end = input_row_start + filter_rows; const int input_col_end = input_col_start + filter_cols; - T sum = 0; + T sum = static_cast<T>(0); if (input_row_start >= 0 && input_col_start >= 0 && input_row_end < in_rows && input_col_end < in_cols) { // Loop that doesn't need to check for boundary conditions. @@ -529,8 +529,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall( __syncthreads(); if (slice_in_range) { - T sum1 = 0; - T sum2 = 0; + T sum1 = static_cast<T>(0); + T sum2 = static_cast<T>(0); int shared_offset = data_idx; const T* filter_ptr = filter_read_offset + shared_data; UNROLL for (int r = 0; r < filter_rows; ++r) { @@ -710,6 +710,7 @@ void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx, "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed")); } +template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>; template struct LaunchDepthwiseConvOp<GPUDevice, float>; template struct LaunchDepthwiseConvOp<GPUDevice, double>; @@ -744,7 +745,7 @@ __global__ void __launch_bounds__(640, 2) const int in_r = (thread_id / in_depth / in_cols) % in_rows; const int b = thread_id / in_depth / in_cols / in_rows; - T sum = 0; + T sum = static_cast<T>(0); const int out_r_start = tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride); @@ -810,7 +811,7 @@ __global__ void __launch_bounds__(640, 2) const int in_d = (thread_id / in_cols / in_rows) % in_depth; const int b = thread_id / in_depth / in_cols / in_rows; - T sum = 0; + T sum = static_cast<T>(0); const int out_d_start = in_d * depth_multiplier; const int out_d_end = out_d_start + depth_multiplier; @@ -919,6 +920,7 @@ void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()( "utGPULaunch failed")); } +template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>; template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>; template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>; @@ -1631,6 +1633,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()( "terGPULaunch failed")); } +template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>; template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>; template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>; } // namespace tensorflow |