diff options
author | 2018-07-04 15:55:59 -0700 | |
---|---|---|
committer | 2018-07-04 15:58:20 -0700 | |
commit | eb8a110e52a8e8ed7b9db48e01115d3110e918c2 (patch) | |
tree | ff984481a05efb9efd0c78c0c7065b71d268c8e6 /tensorflow/compiler/xla/service/llvm_ir | |
parent | 548601aee34f3eb0e6857b5ce6df25db1b50a4b4 (diff) |
[XLA:GPU] Enhance the tiled 0-2-1 transpose algorithm to handle fusion.
Add class TiledParameterInfo to provide information for FusedIrEmitter to read
the content of a tiled parameter from the tile buffer instead of the original
input memory.
Reimplement the tiled 0-2-1 transpose algorithm, which previously handled only
copy instructions, in a more general way so that it can handle both fusion
instructions and copy instructions.
The original tiled 0-2-1 transpose implementation incorrectly used
(tile_size+1) rows for a tile buffer to reduce shared memory bank conflicts,
while it should be (tile_size+1) columns instead. This is a performance issue
and is fixed in the new implementation.
The original tiled 0-2-1 transpose implementation did not generate LLVM alias
metadata for the loads and stores of the tensors. This was due to a bug where
the function IrArray::CastToShape missed copying metadata to the new IrArray
object. This is also a performance issue and is fixed in this change.
Modify KernelSupportLibrary to support emitting an if-statement with a given
branch name prefix.
Add test cases to test the new implementation.
PiperOrigin-RevId: 203310403
Diffstat (limited to 'tensorflow/compiler/xla/service/llvm_ir')
9 files changed, 281 insertions, 6 deletions
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index ce36afc1e6..b02bed92e9 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -107,11 +107,30 @@ cc_library( ) cc_library( + name = "kernel_tiling", + srcs = ["kernel_tiling.cc"], + hdrs = ["kernel_tiling.h"], + deps = [ + ":ir_array", + ":llvm_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:lib", + "@llvm//:core", + ], +) + +cc_library( name = "fused_ir_emitter", srcs = ["fused_ir_emitter.cc"], hdrs = ["fused_ir_emitter.h"], deps = [ ":ir_array", + ":kernel_tiling", ":llvm_util", ":loop_emitter", ":tuple_ops", diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index d909845a3a..21160a770f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -119,7 +119,23 @@ Status FusedIrEmitter::HandleGetTupleElement( } Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) { - generators_[parameter] = [=](const IrArray::Index& index) { + generators_[parameter] = [=](const IrArray::Index& index) -> llvm::Value* { + if (tiled_parameter_info_) { + llvm::Value* param_buffer = tiled_parameter_info_->GetBufferForParameter( + parameter->parameter_number()); + if (param_buffer) { + VLOG(3) << "Use buffer for " << parameter->ToString(); + llvm::Instruction* load_from_buffer = ir_builder_->CreateLoad( + ir_builder_->CreateGEP( + param_buffer, + {index.GetConstantWithIndexType(0), tiled_parameter_info_->x(), + tiled_parameter_info_->y()}), + "tiled_buffer"); + parameter_arrays_[parameter->parameter_number()] + 
.AnnotateBufferLoadStoreInstructionWithMetadata(load_from_buffer); + return load_from_buffer; + } + } return parameter_arrays_[parameter->parameter_number()] .EmitReadArrayElement(index, ir_builder_); }; diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h index b3b6026ef1..a6ceec7b23 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -56,6 +57,7 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { FusedIrEmitter(tensorflow::gtl::ArraySlice<llvm_ir::IrArray> parameter_arrays, ElementalIrEmitter* elemental_emitter) : parameter_arrays_(parameter_arrays), + tiled_parameter_info_(nullptr), elemental_emitter_(elemental_emitter), ir_builder_(elemental_emitter->ir_builder()), module_(elemental_emitter->module()) {} @@ -86,9 +88,14 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { return it->second; } + void SetTiledParameterInfo(const llvm_ir::TiledParameterInfo* info) { + tiled_parameter_info_ = info; + } + private: // Arrays of parameters of fusion instruction tensorflow::gtl::ArraySlice<llvm_ir::IrArray> parameter_arrays_; + const llvm_ir::TiledParameterInfo* tiled_parameter_info_; ElementalIrEmitter* elemental_emitter_; diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc index ea10cef49a..f389cc283f 100644 --- 
a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc @@ -401,6 +401,19 @@ void IrArray::AnnotateLoadStoreInstructionWithMetadata( } } +void IrArray::AnnotateBufferLoadStoreInstructionWithMetadata( + llvm::Instruction* instruction) const { + CHECK(llvm::isa<llvm::LoadInst>(instruction) || + llvm::isa<llvm::StoreInst>(instruction)); + CHECK(is_invariant_) << "IrArray for a parameter is not marked as invariant."; + + for (const auto& kind_md_pair : metadata_) { + if (kind_md_pair.first != llvm::LLVMContext::MD_invariant_load) { + instruction->setMetadata(kind_md_pair.first, kind_md_pair.second); + } + } +} + llvm::Value* IrArray::EmitReadArrayElement(const Index& index, llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name) const { @@ -422,9 +435,11 @@ IrArray IrArray::CastToShape(const Shape& new_shape, llvm::IRBuilder<>* ir_builder) const { llvm::Module* module = ir_builder->GetInsertBlock()->getParent()->getParent(); llvm::Type* new_ir_type = llvm_ir::ShapeToIrType(new_shape, module); - return IrArray( + IrArray new_irarray( ir_builder->CreatePointerCast(base_ptr_, new_ir_type->getPointerTo()), new_shape); + new_irarray.metadata_ = metadata_; + return new_irarray; } /* static */ IrArray::Index IrArray::BumpIndex(const Index& index, diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h index 4648c6d7ac..fb1f01ab6b 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h @@ -248,6 +248,11 @@ class IrArray { void AnnotateLoadStoreInstructionWithMetadata( llvm::Instruction* instruction) const; + // Uses the metadata for a parameter IrArray to annotate the load/store of the + // tile buffer for the parameter. + void AnnotateBufferLoadStoreInstructionWithMetadata( + llvm::Instruction* instruction) const; + // Emit IR to read an array element at the given index. 
Returns the read // result (effectively, a Value loaded from memory). This method seamlessly // handles scalar shapes by broadcasting their value to all indices (index is diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc index 1f6e3c829f..98d0ceb3e2 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc @@ -56,10 +56,11 @@ Status KernelSupportLibrary::For( } Status KernelSupportLibrary::If( - llvm::Value* condition, const std::function<Status()>& true_block_generator, + tensorflow::StringPiece name, llvm::Value* condition, + const std::function<Status()>& true_block_generator, const std::function<Status()>& false_block_generator) { llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(condition, "", ir_builder_); + llvm_ir::EmitIfThenElse(condition, name, ir_builder_); ir_builder_->SetInsertPoint(&if_data.true_block->back()); TF_RETURN_IF_ERROR(true_block_generator()); ir_builder_->SetInsertPoint(&if_data.false_block->back()); diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h index 6f7a9d94e3..9d770cc4c3 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h @@ -203,16 +203,30 @@ class KernelSupportLibrary { // `true_block_generator()`; // else // `false_block_generator()`; - Status If(llvm::Value* condition, + Status If(tensorflow::StringPiece name, llvm::Value* condition, const std::function<Status()>& true_block_generator, const std::function<Status()>& false_block_generator = []() -> Status { return Status::OK(); }); + Status If(llvm::Value* condition, + const std::function<Status()>& true_block_generator, + const std::function<Status()>& false_block_generator = + []() -> Status { 
return Status::OK(); }) { + return If("", condition, true_block_generator, false_block_generator); + } + void IfReturnVoid(llvm::Value* condition, const std::function<void()>& true_block_generator, const std::function<void()>& false_block_generator = []() { }) { - TF_CHECK_OK(If(condition, + IfReturnVoid("", condition, true_block_generator, false_block_generator); + } + + void IfReturnVoid(tensorflow::StringPiece name, llvm::Value* condition, + const std::function<void()>& true_block_generator, + const std::function<void()>& false_block_generator = []() { + }) { + TF_CHECK_OK(If(name, condition, [&]() { true_block_generator(); return Status::OK(); diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc new file mode 100644 index 0000000000..533b75cdae --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc @@ -0,0 +1,118 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { +namespace llvm_ir { + +namespace { +// Returns the indices of the first elements of all consecutive subarrays of the +// given array. For example: +// ConsecutiveSegments({m, m+1, m+2, n, k, k+1}) = {0, 3, 4} +std::vector<size_t> ConsecutiveSegments(tensorflow::gtl::ArraySlice<int64> xs) { + std::vector<size_t> is = {0}; + for (size_t i = 1; i < xs.size(); ++i) { + if (1 != xs[i] - xs[i - 1]) { + is.push_back(i); + } + } + return is; +} + +// Merges the sequences of dimensions of the given shape which start at the +// given indices `segs`. +Shape MergeDimensions(tensorflow::gtl::ArraySlice<size_t> segs, + const Shape& shape) { + std::vector<int64> dimensions; + for (size_t i = 1; i <= segs.size(); ++i) { + dimensions.push_back(std::accumulate( + shape.dimensions().begin() + segs[i - 1], + shape.dimensions().begin() + + (segs.size() == i ? 
shape.dimensions().size() : segs[i]), + 1, std::multiplies<int64>())); + } + return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(), + dimensions); +} +} // namespace + +tensorflow::gtl::optional<std::vector<int64> > FindTranspose021( + const Shape& a, const Shape& b) { + if (!ShapeUtil::CompatibleIgnoringElementType(a, b)) { + return tensorflow::gtl::nullopt; + } + + std::vector<int64> perm(a.dimensions().size()); + { + auto layout_a_orig = LayoutUtil::MinorToMajor(a); + std::vector<int64> layout_a(layout_a_orig.rbegin(), layout_a_orig.rend()); + auto layout_b_orig = LayoutUtil::MinorToMajor(b); + std::vector<int64> layout_b(layout_b_orig.rbegin(), layout_b_orig.rend()); + for (size_t i = 0; i < perm.size(); ++i) { + perm[i] = PositionInContainer(layout_b, layout_a[i]); + } + } + auto segs = ConsecutiveSegments(perm); + if ((3 == segs.size() && 0 == perm[0]) || 2 == segs.size()) { + Shape norm_a = + ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a); + Shape reduced_a = MergeDimensions(segs, norm_a); + auto reduced_a_dims = reduced_a.dimensions(); + std::vector<int64> dims_021; + if (2 == segs.size()) { + // The logical component-0 is of size one. 
+ dims_021 = {1, reduced_a_dims[1], reduced_a_dims[0]}; + } else { + dims_021 = {reduced_a_dims[0], reduced_a_dims[2], reduced_a_dims[1]}; + } + + return dims_021; + } + + return tensorflow::gtl::nullopt; +} + +IrArray::Index GetUnreducedOutputIndex( + const IrArray::Index& reduced_output_index, + const Shape& reduced_output_shape, const Shape& unreduced_output_shape, + llvm::IRBuilder<>* ir_builder) { + auto bounds = reduced_output_shape.dimensions(); + auto minor_to_major = reduced_output_shape.layout().minor_to_major(); + llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0); + int64 multiplier = 1; + for (int i = 0; i < reduced_output_index.size(); ++i) { + int64 dim = minor_to_major[i]; + llvm::Value* addend = ir_builder->CreateMul( + reduced_output_index[dim], + reduced_output_index.GetConstantWithIndexType(multiplier), + "linearizing", + /*HasNUW=*/true, /*HasNSW=*/true); + linear_index = ir_builder->CreateAdd(linear_index, addend, "", + /*HasNUW=*/true, /*HasNSW=*/true); + multiplier *= bounds[dim]; + } + + return IrArray::Index(linear_index, unreduced_output_shape, ir_builder); +} + +} // namespace llvm_ir +} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h new file mode 100644 index 0000000000..6f1268fffb --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h @@ -0,0 +1,80 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_ + +#include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" + +namespace xla { +namespace llvm_ir { + +// About 0-2-1 transpose: +// +// If a shape can be viewed as three logical components 0-1-2 in the order of +// major to minor, a 0-2-1-transpose changes the order of such logical +// components to 0-2-1. We call the shape being transposed the input shape and +// the transposed shape the output shape. The logical view of the input and +// output shapes for the transpose are called the 0-1-2 shape or reduced input +// shape and the 0-2-1 shape or the reduced output shape respectively. The +// original input and output shapes are called the unreduced input and output +// shapes. + +// If `b` is a 0-2-1 transpose of `a` in 0-1-2, return the dimensions for the +// reduced shape of `b` or the 0-2-1 shape. +tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(const Shape& a, + const Shape& b); + +// Return the unreduced output index corresponding to the given reduced output +// index. +IrArray::Index GetUnreducedOutputIndex( + const IrArray::Index& reduced_output_index, + const Shape& reduced_output_shape, const Shape& unreduced_output_shape, + llvm::IRBuilder<>* ir_builder); + +// A class to represent information for tiled parameters to support IR emission +// for 021 transpose. 
+class TiledParameterInfo { + public: + TiledParameterInfo(tensorflow::gtl::ArraySlice<llvm::Value*> param_buffers, + llvm::Value* y, llvm::Value* x) + : param_buffers_(param_buffers), y_(y), x_(x) {} + + llvm::Value* x() const { return x_; } + llvm::Value* y() const { return y_; } + + void set_x(llvm::Value* x) { x_ = x; } + void set_y(llvm::Value* y) { y_ = y; } + + llvm::Value* GetBufferForParameter(int64 index) const { + return param_buffers_[index]; + } + + private: + // Param_buffers_[i] stores the tile buffer for the ith parameter or nullptr + // if the parameter is not tiled. + tensorflow::gtl::ArraySlice<llvm::Value*> param_buffers_; + // The y coordinate within a tile. + llvm::Value* y_; + // The x coordinate within a tile. + llvm::Value* x_; +}; + +} // namespace llvm_ir +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_ |