diff options
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/convolution_thunk.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/convolution_thunk.cc | 324 |
1 files changed, 324 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc new file mode 100644 index 0000000000..30a92ab313 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -0,0 +1,324 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" + +#include <string> + +#include "tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace se = ::perftools::gputools; + +namespace xla { +namespace gpu { + +using Index = BufferAllocation::Index; +using se::dnn::BatchDescriptor; +using se::dnn::ConvolutionDescriptor; +using se::dnn::DataLayout; +using se::dnn::FilterDescriptor; +using se::dnn::FilterLayout; + +ConvolveScratchAllocator::ConvolveScratchAllocator( + int device_ordinal, DeviceMemoryAllocator* memory_allocator) + : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} + +ConvolveScratchAllocator::~ConvolveScratchAllocator() { + for (auto& allocated_buffer : allocated_buffers_) { + if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer) + .ok()) { + // The program can still continue with failed deallocation. + LOG(ERROR) << "Failed to deallocate the allocated buffer: " + << allocated_buffer.opaque(); + } + } +} + +int64 ConvolveScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) { + constexpr int64 kConvolveScratchSize = 1LL << 32; // 4GB by default. + return kConvolveScratchSize; +} + +se::port::StatusOr<se::DeviceMemory<uint8>> +ConvolveScratchAllocator::AllocateBytes(se::Stream* stream, int64 byte_size) { + CHECK_GE(byte_size, 0) << "byte_size must be positive."; + if (byte_size > GetMemoryLimitInBytes(stream)) { + return se::port::Status( + se::port::error::RESOURCE_EXHAUSTED, + tensorflow::strings::Printf( + "Allocating %lld bytes exceeds the memory limit of %lld bytes.", + byte_size, GetMemoryLimitInBytes(stream))); + } + + auto status_or_memory = + memory_allocator_->Allocate(device_ordinal_, byte_size, + /*retry_on_failure=*/false); + if (!status_or_memory.ok()) { + return se::port::Status(se::port::error::RESOURCE_EXHAUSTED, + tensorflow::strings::Printf( + "Failed to allocate %lld bytes on device %d.", + byte_size, device_ordinal_)); + } + se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie(); + allocated_buffers_.push_back(allocated_buffer); + total_allocated_bytes_ += byte_size; + return se::DeviceMemory<uint8>(allocated_buffer); +} + +string ConvolutionKindToString( + ConvolutionThunk::ConvolutionKind convolution_kind) { + switch (convolution_kind) { + case ConvolutionThunk::ConvolutionKind::kForward: + return "forward"; + case ConvolutionThunk::ConvolutionKind::kBackwardFilter: + return "backward_filter"; + case ConvolutionThunk::ConvolutionKind::kBackwardInput: + return "backward_input"; + } +} + +ConvolutionThunk::ConvolutionThunk( + ConvolutionKind convolution_kind, Index input_buffer, Index filter_buffer, + Index output_buffer, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, const Window& window, + const ConvolutionDimensionNumbers& dim_nums, const HloInstruction* hlo) + : Thunk(Kind::kConvolution, hlo), + convolution_kind_(convolution_kind), + input_buffer_(input_buffer), + filter_buffer_(filter_buffer), + output_buffer_(output_buffer), + input_shape_(input_shape), + filter_shape_(filter_shape), + output_shape_(output_shape), + window_(window), + dim_nums_(dim_nums) {} + +tensorflow::Status ConvolutionThunk::ExecuteOnStream( + const BufferAllocations& buffer_allocations, se::Stream* stream) { + VLOG(3) << "Convolution kind: " << ConvolutionKindToString(convolution_kind_); + VLOG(3) << "input shape: { " << input_shape_.ShortDebugString() << " }"; + VLOG(3) << "filter shape: { " << filter_shape_.ShortDebugString() << " }"; + VLOG(3) << "Output shape: { " << output_shape_.ShortDebugString() << " }"; + VLOG(3) << "Dim nums: { " << dim_nums_.ShortDebugString() << " }"; + VLOG(3) << "Window: { " << window_.ShortDebugString() << " }"; + + CHECK_EQ(F32, output_shape_.element_type()); + CHECK_EQ(2, window_.dimensions_size()); + for (const WindowDimension& dim : window_.dimensions()) { + CHECK_EQ(dim.padding_low(), dim.padding_high()); + } + + const WindowDimension& height = window_.dimensions(0); + const WindowDimension& width = window_.dimensions(1); + // cuDNN's convolution APIs support the BDYX layout for activations/output and + // the OIYX layout for weights. + // TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls + // when we switch to cuDNN v5. + BatchDescriptor input_descriptor; + input_descriptor.set_layout(DataLayout::kBatchDepthYX) + .set_height(input_shape_.dimensions(dim_nums_.spatial_dimensions(0))) + .set_width(input_shape_.dimensions(dim_nums_.spatial_dimensions(1))) + .set_feature_map_count( + input_shape_.dimensions(dim_nums_.feature_dimension())) + .set_count(input_shape_.dimensions(dim_nums_.batch_dimension())); + + FilterDescriptor filter_descriptor; + filter_descriptor.set_layout(FilterLayout::kOutputInputYX) + .set_input_feature_map_count( + filter_shape_.dimensions(dim_nums_.kernel_input_feature_dimension())) + .set_output_feature_map_count( + filter_shape_.dimensions(dim_nums_.kernel_output_feature_dimension())) + .set_input_filter_height( + filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(0))) + .set_input_filter_width( + filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(1))); + + ConvolutionDescriptor convolution_descriptor; + convolution_descriptor.set_zero_padding_width(width.padding_low()) + .set_zero_padding_height(height.padding_low()) + .set_horizontal_filter_stride(width.stride()) + .set_vertical_filter_stride(height.stride()); + + BatchDescriptor output_descriptor; + output_descriptor.set_layout(DataLayout::kBatchDepthYX) + .set_height(output_shape_.dimensions(dim_nums_.spatial_dimensions(0))) + .set_width(output_shape_.dimensions(dim_nums_.spatial_dimensions(1))) + .set_feature_map_count( + output_shape_.dimensions(dim_nums_.feature_dimension())) + .set_count(output_shape_.dimensions(dim_nums_.batch_dimension())); + + se::DeviceMemory<float> input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory<float> filter_data( + buffer_allocations.GetDeviceAddress(filter_buffer_)); + se::DeviceMemory<float> output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + return ConvolveWithTune(input_descriptor, input_data, filter_descriptor, + filter_data, output_descriptor, output_data, + convolution_descriptor, buffer_allocations, stream); +} + +tensorflow::Status ConvolutionThunk::Convolve( + const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data, + const FilterDescriptor& filter_descriptor, + se::DeviceMemory<float> filter_data, + const BatchDescriptor& output_descriptor, + se::DeviceMemory<float> output_data, + const ConvolutionDescriptor& convolution_descriptor, + const se::dnn::AlgorithmConfig& algorithm_config, se::Stream* stream, + ConvolveScratchAllocator* scratch_allocator, + se::dnn::ProfileResult* profile_result) { + bool launch_ok; + switch (convolution_kind_) { + case ConvolutionKind::kBackwardFilter: + launch_ok = + stream + ->ThenConvolveBackwardFilterWithAlgorithm( + input_descriptor, input_data, output_descriptor, output_data, + convolution_descriptor, filter_descriptor, &filter_data, + scratch_allocator, algorithm_config, profile_result) + .ok(); + break; + case ConvolutionKind::kBackwardInput: + launch_ok = stream + ->ThenConvolveBackwardDataWithAlgorithm( + filter_descriptor, filter_data, output_descriptor, + output_data, convolution_descriptor, input_descriptor, + &input_data, scratch_allocator, algorithm_config, + profile_result) + .ok(); + break; + case ConvolutionKind::kForward: + launch_ok = + stream + ->ThenConvolveWithAlgorithm( + input_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, &output_data, + scratch_allocator, algorithm_config, profile_result) + .ok(); + break; + } + if (launch_ok) { + return tensorflow::Status::OK(); + } + return InternalError( + "Unable to launch convolution for thunk %p with type %s and algorithm " + "(%lld, %lld)", + this, ConvolutionKindToString(convolution_kind_).c_str(), + algorithm_config.algorithm(), algorithm_config.algorithm_no_scratch()); +} + +std::vector<se::dnn::AlgorithmType> ConvolutionThunk::GetAlgorithms( + se::StreamExecutor* stream_exec) const { + std::vector<se::dnn::AlgorithmType> algorithms; + switch (convolution_kind_) { + case ConvolutionKind::kBackwardFilter: + CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(&algorithms)); + break; + case ConvolutionKind::kBackwardInput: + CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(&algorithms)); + break; + case ConvolutionKind::kForward: + CHECK(stream_exec->GetConvolveAlgorithms(&algorithms)); + break; + } + return algorithms; +} + +tensorflow::Status ConvolutionThunk::ConvolveWithTune( + const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data, + const FilterDescriptor& filter_descriptor, + se::DeviceMemory<float> filter_data, + const BatchDescriptor& output_descriptor, + se::DeviceMemory<float> output_data, + const ConvolutionDescriptor& convolution_descriptor, + const BufferAllocations& buffer_allocations, se::Stream* stream) { + // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out. + legacy_flags::ConvolutionThunkFlags* flags = + legacy_flags::GetConvolutionThunkFlags(); + if (flags->xla_gpu_autotune_convolution_algorithm && + best_algorithm_.algorithm() == se::dnn::kDefaultAlgorithm) { + // Auto-tuning either is disabled or only happens in the first run of this + // function. + VLOG(2) << "Profiling for best convolution algorithm used for " + "ConvolutionThunk: " + << this; + + se::dnn::ProfileResult best_result; + se::dnn::ProfileResult best_result_without_scratch; + for (se::dnn::AlgorithmType algorithm : GetAlgorithms(stream->parent())) { + ConvolveScratchAllocator scratch_allocator( + buffer_allocations.device_ordinal(), + buffer_allocations.memory_allocator()); + se::dnn::ProfileResult profile_result; + bool launch_ok = + Convolve(input_descriptor, input_data, filter_descriptor, filter_data, + output_descriptor, output_data, convolution_descriptor, + se::dnn::AlgorithmConfig(algorithm, algorithm), stream, + &scratch_allocator, &profile_result) + .ok(); + if (launch_ok && profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalAllocatedBytes() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_without_scratch.elapsed_time_in_ms()) { + best_result_without_scratch = profile_result; + } + } + } + + if (best_result.is_valid()) { + best_algorithm_.set_algorithm(best_result.algorithm()); + } else { + LOG(ERROR) << "No convolution algorithm works with profiling. Fall back " + "to the default algorithm."; + best_algorithm_.set_algorithm(se::dnn::kDefaultAlgorithm); + } + + if (best_result_without_scratch.is_valid()) { + best_algorithm_.set_algorithm_no_scratch( + best_result_without_scratch.algorithm()); + } else { + LOG(ERROR) << "No convolution algorithm without scratch works with " + "profiling. Fall back " + "to the default algorithm."; + best_algorithm_.set_algorithm_no_scratch(se::dnn::kDefaultAlgorithm); + } + } + + { + VLOG(2) << "Using convolution algorithm (" << best_algorithm_.algorithm() + << ", " << best_algorithm_.algorithm_no_scratch() + << ") for ConvolutionThunk: " << this; + ConvolveScratchAllocator scratch_allocator( + buffer_allocations.device_ordinal(), + buffer_allocations.memory_allocator()); + return Convolve(input_descriptor, input_data, filter_descriptor, + filter_data, output_descriptor, output_data, + convolution_descriptor, best_algorithm_, stream, + &scratch_allocator, nullptr); + } +} + +} // namespace gpu +} // namespace xla |