aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/convolution_thunk.cc')
-rw-r--r--tensorflow/compiler/xla/service/gpu/convolution_thunk.cc324
1 files changed, 324 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
new file mode 100644
index 0000000000..30a92ab313
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -0,0 +1,324 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
+
+#include <string>
+
+#include "tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+namespace gpu {
+
+using Index = BufferAllocation::Index;
+using se::dnn::BatchDescriptor;
+using se::dnn::ConvolutionDescriptor;
+using se::dnn::DataLayout;
+using se::dnn::FilterDescriptor;
+using se::dnn::FilterLayout;
+
+ConvolveScratchAllocator::ConvolveScratchAllocator(
+ int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+ : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
+
+ConvolveScratchAllocator::~ConvolveScratchAllocator() {
+ for (auto& allocated_buffer : allocated_buffers_) {
+ if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
+ .ok()) {
+ // The program can still continue with failed deallocation.
+ LOG(ERROR) << "Failed to deallocate the allocated buffer: "
+ << allocated_buffer.opaque();
+ }
+ }
+}
+
+int64 ConvolveScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) {
+ constexpr int64 kConvolveScratchSize = 1LL << 32; // 4GB by default.
+ return kConvolveScratchSize;
+}
+
+se::port::StatusOr<se::DeviceMemory<uint8>>
+ConvolveScratchAllocator::AllocateBytes(se::Stream* stream, int64 byte_size) {
+ CHECK_GE(byte_size, 0) << "byte_size must be positive.";
+ if (byte_size > GetMemoryLimitInBytes(stream)) {
+ return se::port::Status(
+ se::port::error::RESOURCE_EXHAUSTED,
+ tensorflow::strings::Printf(
+ "Allocating %lld bytes exceeds the memory limit of %lld bytes.",
+ byte_size, GetMemoryLimitInBytes(stream)));
+ }
+
+ auto status_or_memory =
+ memory_allocator_->Allocate(device_ordinal_, byte_size,
+ /*retry_on_failure=*/false);
+ if (!status_or_memory.ok()) {
+ return se::port::Status(se::port::error::RESOURCE_EXHAUSTED,
+ tensorflow::strings::Printf(
+ "Failed to allocate %lld bytes on device %d.",
+ byte_size, device_ordinal_));
+ }
+ se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
+ allocated_buffers_.push_back(allocated_buffer);
+ total_allocated_bytes_ += byte_size;
+ return se::DeviceMemory<uint8>(allocated_buffer);
+}
+
+string ConvolutionKindToString(
+ ConvolutionThunk::ConvolutionKind convolution_kind) {
+ switch (convolution_kind) {
+ case ConvolutionThunk::ConvolutionKind::kForward:
+ return "forward";
+ case ConvolutionThunk::ConvolutionKind::kBackwardFilter:
+ return "backward_filter";
+ case ConvolutionThunk::ConvolutionKind::kBackwardInput:
+ return "backward_input";
+ }
+}
+
+ConvolutionThunk::ConvolutionThunk(
+ ConvolutionKind convolution_kind, Index input_buffer, Index filter_buffer,
+ Index output_buffer, const Shape& input_shape, const Shape& filter_shape,
+ const Shape& output_shape, const Window& window,
+ const ConvolutionDimensionNumbers& dim_nums, const HloInstruction* hlo)
+ : Thunk(Kind::kConvolution, hlo),
+ convolution_kind_(convolution_kind),
+ input_buffer_(input_buffer),
+ filter_buffer_(filter_buffer),
+ output_buffer_(output_buffer),
+ input_shape_(input_shape),
+ filter_shape_(filter_shape),
+ output_shape_(output_shape),
+ window_(window),
+ dim_nums_(dim_nums) {}
+
+tensorflow::Status ConvolutionThunk::ExecuteOnStream(
+ const BufferAllocations& buffer_allocations, se::Stream* stream) {
+ VLOG(3) << "Convolution kind: " << ConvolutionKindToString(convolution_kind_);
+ VLOG(3) << "input shape: { " << input_shape_.ShortDebugString() << " }";
+ VLOG(3) << "filter shape: { " << filter_shape_.ShortDebugString() << " }";
+ VLOG(3) << "Output shape: { " << output_shape_.ShortDebugString() << " }";
+ VLOG(3) << "Dim nums: { " << dim_nums_.ShortDebugString() << " }";
+ VLOG(3) << "Window: { " << window_.ShortDebugString() << " }";
+
+ CHECK_EQ(F32, output_shape_.element_type());
+ CHECK_EQ(2, window_.dimensions_size());
+ for (const WindowDimension& dim : window_.dimensions()) {
+ CHECK_EQ(dim.padding_low(), dim.padding_high());
+ }
+
+ const WindowDimension& height = window_.dimensions(0);
+ const WindowDimension& width = window_.dimensions(1);
+ // cuDNN's convolution APIs support the BDYX layout for activations/output and
+ // the OIYX layout for weights.
+ // TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls
+ // when we switch to cuDNN v5.
+ BatchDescriptor input_descriptor;
+ input_descriptor.set_layout(DataLayout::kBatchDepthYX)
+ .set_height(input_shape_.dimensions(dim_nums_.spatial_dimensions(0)))
+ .set_width(input_shape_.dimensions(dim_nums_.spatial_dimensions(1)))
+ .set_feature_map_count(
+ input_shape_.dimensions(dim_nums_.feature_dimension()))
+ .set_count(input_shape_.dimensions(dim_nums_.batch_dimension()));
+
+ FilterDescriptor filter_descriptor;
+ filter_descriptor.set_layout(FilterLayout::kOutputInputYX)
+ .set_input_feature_map_count(
+ filter_shape_.dimensions(dim_nums_.kernel_input_feature_dimension()))
+ .set_output_feature_map_count(
+ filter_shape_.dimensions(dim_nums_.kernel_output_feature_dimension()))
+ .set_input_filter_height(
+ filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(0)))
+ .set_input_filter_width(
+ filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(1)));
+
+ ConvolutionDescriptor convolution_descriptor;
+ convolution_descriptor.set_zero_padding_width(width.padding_low())
+ .set_zero_padding_height(height.padding_low())
+ .set_horizontal_filter_stride(width.stride())
+ .set_vertical_filter_stride(height.stride());
+
+ BatchDescriptor output_descriptor;
+ output_descriptor.set_layout(DataLayout::kBatchDepthYX)
+ .set_height(output_shape_.dimensions(dim_nums_.spatial_dimensions(0)))
+ .set_width(output_shape_.dimensions(dim_nums_.spatial_dimensions(1)))
+ .set_feature_map_count(
+ output_shape_.dimensions(dim_nums_.feature_dimension()))
+ .set_count(output_shape_.dimensions(dim_nums_.batch_dimension()));
+
+ se::DeviceMemory<float> input_data(
+ buffer_allocations.GetDeviceAddress(input_buffer_));
+ se::DeviceMemory<float> filter_data(
+ buffer_allocations.GetDeviceAddress(filter_buffer_));
+ se::DeviceMemory<float> output_data(
+ buffer_allocations.GetDeviceAddress(output_buffer_));
+ return ConvolveWithTune(input_descriptor, input_data, filter_descriptor,
+ filter_data, output_descriptor, output_data,
+ convolution_descriptor, buffer_allocations, stream);
+}
+
+tensorflow::Status ConvolutionThunk::Convolve(
+ const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
+ const FilterDescriptor& filter_descriptor,
+ se::DeviceMemory<float> filter_data,
+ const BatchDescriptor& output_descriptor,
+ se::DeviceMemory<float> output_data,
+ const ConvolutionDescriptor& convolution_descriptor,
+ const se::dnn::AlgorithmConfig& algorithm_config, se::Stream* stream,
+ ConvolveScratchAllocator* scratch_allocator,
+ se::dnn::ProfileResult* profile_result) {
+ bool launch_ok;
+ switch (convolution_kind_) {
+ case ConvolutionKind::kBackwardFilter:
+ launch_ok =
+ stream
+ ->ThenConvolveBackwardFilterWithAlgorithm(
+ input_descriptor, input_data, output_descriptor, output_data,
+ convolution_descriptor, filter_descriptor, &filter_data,
+ scratch_allocator, algorithm_config, profile_result)
+ .ok();
+ break;
+ case ConvolutionKind::kBackwardInput:
+ launch_ok = stream
+ ->ThenConvolveBackwardDataWithAlgorithm(
+ filter_descriptor, filter_data, output_descriptor,
+ output_data, convolution_descriptor, input_descriptor,
+ &input_data, scratch_allocator, algorithm_config,
+ profile_result)
+ .ok();
+ break;
+ case ConvolutionKind::kForward:
+ launch_ok =
+ stream
+ ->ThenConvolveWithAlgorithm(
+ input_descriptor, input_data, filter_descriptor, filter_data,
+ convolution_descriptor, output_descriptor, &output_data,
+ scratch_allocator, algorithm_config, profile_result)
+ .ok();
+ break;
+ }
+ if (launch_ok) {
+ return tensorflow::Status::OK();
+ }
+ return InternalError(
+ "Unable to launch convolution for thunk %p with type %s and algorithm "
+ "(%lld, %lld)",
+ this, ConvolutionKindToString(convolution_kind_).c_str(),
+ algorithm_config.algorithm(), algorithm_config.algorithm_no_scratch());
+}
+
+std::vector<se::dnn::AlgorithmType> ConvolutionThunk::GetAlgorithms(
+ se::StreamExecutor* stream_exec) const {
+ std::vector<se::dnn::AlgorithmType> algorithms;
+ switch (convolution_kind_) {
+ case ConvolutionKind::kBackwardFilter:
+ CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(&algorithms));
+ break;
+ case ConvolutionKind::kBackwardInput:
+ CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(&algorithms));
+ break;
+ case ConvolutionKind::kForward:
+ CHECK(stream_exec->GetConvolveAlgorithms(&algorithms));
+ break;
+ }
+ return algorithms;
+}
+
+tensorflow::Status ConvolutionThunk::ConvolveWithTune(
+ const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
+ const FilterDescriptor& filter_descriptor,
+ se::DeviceMemory<float> filter_data,
+ const BatchDescriptor& output_descriptor,
+ se::DeviceMemory<float> output_data,
+ const ConvolutionDescriptor& convolution_descriptor,
+ const BufferAllocations& buffer_allocations, se::Stream* stream) {
+ // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
+ legacy_flags::ConvolutionThunkFlags* flags =
+ legacy_flags::GetConvolutionThunkFlags();
+ if (flags->xla_gpu_autotune_convolution_algorithm &&
+ best_algorithm_.algorithm() == se::dnn::kDefaultAlgorithm) {
+ // Auto-tuning either is disabled or only happens in the first run of this
+ // function.
+ VLOG(2) << "Profiling for best convolution algorithm used for "
+ "ConvolutionThunk: "
+ << this;
+
+ se::dnn::ProfileResult best_result;
+ se::dnn::ProfileResult best_result_without_scratch;
+ for (se::dnn::AlgorithmType algorithm : GetAlgorithms(stream->parent())) {
+ ConvolveScratchAllocator scratch_allocator(
+ buffer_allocations.device_ordinal(),
+ buffer_allocations.memory_allocator());
+ se::dnn::ProfileResult profile_result;
+ bool launch_ok =
+ Convolve(input_descriptor, input_data, filter_descriptor, filter_data,
+ output_descriptor, output_data, convolution_descriptor,
+ se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
+ &scratch_allocator, &profile_result)
+ .ok();
+ if (launch_ok && profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalAllocatedBytes() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_without_scratch.elapsed_time_in_ms()) {
+ best_result_without_scratch = profile_result;
+ }
+ }
+ }
+
+ if (best_result.is_valid()) {
+ best_algorithm_.set_algorithm(best_result.algorithm());
+ } else {
+ LOG(ERROR) << "No convolution algorithm works with profiling. Fall back "
+ "to the default algorithm.";
+ best_algorithm_.set_algorithm(se::dnn::kDefaultAlgorithm);
+ }
+
+ if (best_result_without_scratch.is_valid()) {
+ best_algorithm_.set_algorithm_no_scratch(
+ best_result_without_scratch.algorithm());
+ } else {
+ LOG(ERROR) << "No convolution algorithm without scratch works with "
+ "profiling. Fall back "
+ "to the default algorithm.";
+ best_algorithm_.set_algorithm_no_scratch(se::dnn::kDefaultAlgorithm);
+ }
+ }
+
+ {
+ VLOG(2) << "Using convolution algorithm (" << best_algorithm_.algorithm()
+ << ", " << best_algorithm_.algorithm_no_scratch()
+ << ") for ConvolutionThunk: " << this;
+ ConvolveScratchAllocator scratch_allocator(
+ buffer_allocations.device_ordinal(),
+ buffer_allocations.memory_allocator());
+ return Convolve(input_descriptor, input_data, filter_descriptor,
+ filter_data, output_descriptor, output_data,
+ convolution_descriptor, best_algorithm_, stream,
+ &scratch_allocator, nullptr);
+ }
+}
+
+} // namespace gpu
+} // namespace xla