diff options
Diffstat (limited to 'tensorflow/core/kernels/conv_grad_ops_3d.cc')
-rw-r--r-- | tensorflow/core/kernels/conv_grad_ops_3d.cc | 115 |
1 files changed, 62 insertions, 53 deletions
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc index cdb4b602ad..8ad56053a8 100644 --- a/tensorflow/core/kernels/conv_grad_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc @@ -649,40 +649,45 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { }; using perftools::gputools::dnn::AlgorithmConfig; - using perftools::gputools::dnn::AlgorithmType; + using perftools::gputools::dnn::AlgorithmDesc; using perftools::gputools::dnn::ProfileResult; - using perftools::gputools::dnn::kDefaultAlgorithm; AlgorithmConfig algorithm_config; if (cudnn_use_autotune_ && !AutoTuneConv3dBwdData::GetInstance()->Find( conv_parameters, &algorithm_config)) { - std::vector<AlgorithmType> algorithms; + std::vector<AlgorithmDesc::Index> algorithms; CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; - for (auto profile_algorithm : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. - CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, - context); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_desc, filter_ptr, output_desc, out_backprop_ptr, - conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, - AlgorithmConfig(profile_algorithm), &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; + // TODO(benbarsdell): Ideally this should not attempt using tensor op math + // if it's not enabled. + for (bool use_tensor_ops : {false, true}) { + for (auto algo_index : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. + AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); + CudnnScratchAllocator scratch_allocator( + ConvolveBackwardDataScratchSize, context); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardDataWithAlgorithm( + filter_desc, filter_ptr, output_desc, out_backprop_ptr, + conv_desc, input_desc, &in_backprop_ptr, + &scratch_allocator, AlgorithmConfig(profile_algorithm), + &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; + } } } } @@ -1016,41 +1021,45 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { }; using perftools::gputools::dnn::AlgorithmConfig; - using perftools::gputools::dnn::AlgorithmType; + using perftools::gputools::dnn::AlgorithmDesc; using perftools::gputools::dnn::ProfileResult; - using perftools::gputools::dnn::kDefaultAlgorithm; AlgorithmConfig algorithm_config; if (cudnn_use_autotune_ && !AutoTuneConv3dBwdFilter::GetInstance()->Find( conv_parameters, &algorithm_config)) { - std::vector<AlgorithmType> algorithms; + std::vector<AlgorithmDesc::Index> algorithms; CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; - for (auto profile_algorithm : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. - CudnnScratchAllocator scratch_allocator( - ConvolveBackwardFilterScratchSize, context); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_desc, input_ptr, output_desc, out_backprop_ptr, - conv_desc, filter_desc, &filter_backprop_ptr, - &scratch_allocator, AlgorithmConfig(profile_algorithm), - &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; + // TODO(benbarsdell): Ideally this should not attempt using tensor op math + // if it's not enabled. + for (bool use_tensor_ops : {false, true}) { + for (auto algo_index : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. + AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); + CudnnScratchAllocator scratch_allocator( + ConvolveBackwardFilterScratchSize, context); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardFilterWithAlgorithm( + input_desc, input_ptr, output_desc, out_backprop_ptr, + conv_desc, filter_desc, &filter_backprop_ptr, + &scratch_allocator, AlgorithmConfig(profile_algorithm), + &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; + } } } } |