path: root/tensorflow/compiler/xla/service/llvm_ir
author    Bixia Zheng <bixia@google.com>  2018-07-04 15:55:59 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>  2018-07-04 15:58:20 -0700
commit    eb8a110e52a8e8ed7b9db48e01115d3110e918c2 (patch)
tree      ff984481a05efb9efd0c78c0c7065b71d268c8e6 /tensorflow/compiler/xla/service/llvm_ir
parent    548601aee34f3eb0e6857b5ce6df25db1b50a4b4 (diff)
[XLA:GPU] Enhance the tiled 0-2-1 transpose algorithm to handle fusion.
Add class TiledParameterInfo to provide the information that FusedIrEmitter needs to read the content of a tiled parameter from the tile buffer instead of from the original input memory.

Reimplement the tiled 0-2-1 transpose algorithm, which previously handled only copy instructions, in a more general way so that it can handle both fusion instructions and copy instructions.

The original tiled 0-2-1 transpose implementation incorrectly used (tile_size+1) rows for the tile buffer to reduce shared memory bank conflicts, when it should have used (tile_size+1) columns instead. This is a performance issue and is fixed in the new implementation.

The original tiled 0-2-1 transpose implementation also did not generate LLVM alias metadata for the loads and stores of the tensors. This was due to a bug where IrArray::CastToShape failed to copy metadata to the new IrArray object. This is also a performance issue and is fixed in this change.

Modify KernelSupportLibrary to support emitting an if-stmt with a given branch name prefix.

Add test cases for the new implementation.

PiperOrigin-RevId: 203310403
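The bank-conflict fix in brief: NVIDIA shared memory is divided into 32 four-byte banks, so a [32][32] float tile places every element of a column in the same bank and column-wise reads serialize; padding each row to 33 elements staggers the columns across banks. A minimal C++ sketch of the arithmetic (tile size and bank count are illustrative assumptions, not taken from this commit):

#include <cstdio>

int main() {
  // A word at offset `w` of shared memory lives in bank w % 32. In a
  // [32][32] float tile, walking a column has stride 32 words, so every
  // element maps to the same bank. Padding each row to 33 words makes the
  // column stride co-prime with the bank count, spreading a column across
  // all 32 banks.
  constexpr int kTileSize = 32;
  constexpr int kBanks = 32;
  for (int row = 0; row < 4; ++row) {
    std::printf("row %2d: unpadded bank %2d, padded bank %2d\n", row,
                (row * kTileSize) % kBanks, (row * (kTileSize + 1)) % kBanks);
  }
  // unpadded: 0 0 0 0 ...   padded: 0 1 2 3 ...
}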
Diffstat (limited to 'tensorflow/compiler/xla/service/llvm_ir')
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/BUILD                      |  19
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc        |  18
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h         |   7
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/ir_array.cc                |  17
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/ir_array.h                 |   5
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc  |   5
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h   |  18
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc           | 118
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h            |  80
9 files changed, 281 insertions, 6 deletions
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index ce36afc1e6..b02bed92e9 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -107,11 +107,30 @@ cc_library(
)
cc_library(
+ name = "kernel_tiling",
+ srcs = ["kernel_tiling.cc"],
+ hdrs = ["kernel_tiling.h"],
+ deps = [
+ ":ir_array",
+ ":llvm_util",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:statusor",
+ "//tensorflow/compiler/xla:types",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/core:lib",
+ "@llvm//:core",
+ ],
+)
+
+cc_library(
name = "fused_ir_emitter",
srcs = ["fused_ir_emitter.cc"],
hdrs = ["fused_ir_emitter.h"],
deps = [
":ir_array",
+ ":kernel_tiling",
":llvm_util",
":loop_emitter",
":tuple_ops",
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index d909845a3a..21160a770f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -119,7 +119,23 @@ Status FusedIrEmitter::HandleGetTupleElement(
}
Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) {
- generators_[parameter] = [=](const IrArray::Index& index) {
+ generators_[parameter] = [=](const IrArray::Index& index) -> llvm::Value* {
+ if (tiled_parameter_info_) {
+ llvm::Value* param_buffer = tiled_parameter_info_->GetBufferForParameter(
+ parameter->parameter_number());
+ if (param_buffer) {
+ VLOG(3) << "Use buffer for " << parameter->ToString();
+ llvm::Instruction* load_from_buffer = ir_builder_->CreateLoad(
+ ir_builder_->CreateGEP(
+ param_buffer,
+ {index.GetConstantWithIndexType(0), tiled_parameter_info_->x(),
+ tiled_parameter_info_->y()}),
+ "tiled_buffer");
+ parameter_arrays_[parameter->parameter_number()]
+ .AnnotateBufferLoadStoreInstructionWithMetadata(load_from_buffer);
+ return load_from_buffer;
+ }
+ }
return parameter_arrays_[parameter->parameter_number()]
.EmitReadArrayElement(index, ir_builder_);
};
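As a plain-C++ analogue of the load emitted above: the GEP indices {0, x, y} first step through the buffer pointer itself and then select element [x][y]. A sketch under the assumption (not stated in this hunk) that the tile buffer is a padded two-dimensional array:

#include <cstddef>

constexpr int kTileSize = 32;
// Hypothetical element type and padded tile layout; the diff itself only
// shows that the buffer is indexed as {0, x, y}.
using TileBuffer = float[kTileSize][kTileSize + 1];

// Counterpart of the emitted GEP+load: the leading 0 dereferences the buffer
// pointer, and x/y select the element within the tile.
float LoadFromTile(const TileBuffer* param_buffer, size_t x, size_t y) {
  return (*param_buffer)[x][y];
}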
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index b3b6026ef1..a6ceec7b23 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -25,6 +25,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h"
#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -56,6 +57,7 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
FusedIrEmitter(tensorflow::gtl::ArraySlice<llvm_ir::IrArray> parameter_arrays,
ElementalIrEmitter* elemental_emitter)
: parameter_arrays_(parameter_arrays),
+ tiled_parameter_info_(nullptr),
elemental_emitter_(elemental_emitter),
ir_builder_(elemental_emitter->ir_builder()),
module_(elemental_emitter->module()) {}
@@ -86,9 +88,14 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
return it->second;
}
+ void SetTiledParameterInfo(const llvm_ir::TiledParameterInfo* info) {
+ tiled_parameter_info_ = info;
+ }
+
private:
// Arrays of parameters of fusion instruction
tensorflow::gtl::ArraySlice<llvm_ir::IrArray> parameter_arrays_;
+ const llvm_ir::TiledParameterInfo* tiled_parameter_info_;
ElementalIrEmitter* elemental_emitter_;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index ea10cef49a..f389cc283f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -401,6 +401,19 @@ void IrArray::AnnotateLoadStoreInstructionWithMetadata(
}
}
+void IrArray::AnnotateBufferLoadStoreInstructionWithMetadata(
+ llvm::Instruction* instruction) const {
+ CHECK(llvm::isa<llvm::LoadInst>(instruction) ||
+ llvm::isa<llvm::StoreInst>(instruction));
+ CHECK(is_invariant_) << "IrArray for a parameter is not marked as invariant.";
+
+ for (const auto& kind_md_pair : metadata_) {
+ if (kind_md_pair.first != llvm::LLVMContext::MD_invariant_load) {
+ instruction->setMetadata(kind_md_pair.first, kind_md_pair.second);
+ }
+ }
+}
+
llvm::Value* IrArray::EmitReadArrayElement(const Index& index,
llvm::IRBuilder<>* ir_builder,
tensorflow::StringPiece name) const {
@@ -422,9 +435,11 @@ IrArray IrArray::CastToShape(const Shape& new_shape,
llvm::IRBuilder<>* ir_builder) const {
llvm::Module* module = ir_builder->GetInsertBlock()->getParent()->getParent();
llvm::Type* new_ir_type = llvm_ir::ShapeToIrType(new_shape, module);
- return IrArray(
+ IrArray new_irarray(
ir_builder->CreatePointerCast(base_ptr_, new_ir_type->getPointerTo()),
new_shape);
+ new_irarray.metadata_ = metadata_;
+ return new_irarray;
}
/* static */ IrArray::Index IrArray::BumpIndex(const Index& index,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index 4648c6d7ac..fb1f01ab6b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -248,6 +248,11 @@ class IrArray {
void AnnotateLoadStoreInstructionWithMetadata(
llvm::Instruction* instruction) const;
+ // Uses the metadata for a parameter IrArray to annotate the load/store of the
+ // tile buffer for the parameter.
+ void AnnotateBufferLoadStoreInstructionWithMetadata(
+ llvm::Instruction* instruction) const;
+
// Emit IR to read an array element at the given index. Returns the read
// result (effectively, a Value loaded from memory). This method seamlessly
// handles scalar shapes by broadcasting their value to all indices (index is
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index 1f6e3c829f..98d0ceb3e2 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -56,10 +56,11 @@ Status KernelSupportLibrary::For(
}
Status KernelSupportLibrary::If(
- llvm::Value* condition, const std::function<Status()>& true_block_generator,
+ tensorflow::StringPiece name, llvm::Value* condition,
+ const std::function<Status()>& true_block_generator,
const std::function<Status()>& false_block_generator) {
llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(condition, "", ir_builder_);
+ llvm_ir::EmitIfThenElse(condition, name, ir_builder_);
ir_builder_->SetInsertPoint(&if_data.true_block->back());
TF_RETURN_IF_ERROR(true_block_generator());
ir_builder_->SetInsertPoint(&if_data.false_block->back());
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 6f7a9d94e3..9d770cc4c3 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -203,16 +203,30 @@ class KernelSupportLibrary {
// `true_block_generator()`;
// else
// `false_block_generator()`;
- Status If(llvm::Value* condition,
+ Status If(tensorflow::StringPiece name, llvm::Value* condition,
const std::function<Status()>& true_block_generator,
const std::function<Status()>& false_block_generator =
[]() -> Status { return Status::OK(); });
+ Status If(llvm::Value* condition,
+ const std::function<Status()>& true_block_generator,
+ const std::function<Status()>& false_block_generator =
+ []() -> Status { return Status::OK(); }) {
+ return If("", condition, true_block_generator, false_block_generator);
+ }
+
void IfReturnVoid(llvm::Value* condition,
const std::function<void()>& true_block_generator,
const std::function<void()>& false_block_generator = []() {
}) {
- TF_CHECK_OK(If(condition,
+ IfReturnVoid("", condition, true_block_generator, false_block_generator);
+ }
+
+ void IfReturnVoid(tensorflow::StringPiece name, llvm::Value* condition,
+ const std::function<void()>& true_block_generator,
+ const std::function<void()>& false_block_generator = []() {
+ }) {
+ TF_CHECK_OK(If(name, condition,
[&]() {
true_block_generator();
return Status::OK();
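A hedged usage sketch of the new named overload (ir_builder and x_in_bounds are assumed to exist at the call site; only the If(name, condition, ...) signature comes from this change):

// Hypothetical call site inside a kernel emitter.
KernelSupportLibrary ksl(ir_builder);
TF_RETURN_IF_ERROR(ksl.If(
    "x_in_tile", /*condition=*/x_in_bounds,
    /*true_block_generator=*/[&]() -> Status {
      // Emit the guarded loads/stores here. The basic blocks created by
      // EmitIfThenElse now carry the "x_in_tile" prefix, which makes the
      // generated LLVM IR easy to attribute to its source if-stmt.
      return Status::OK();
    }));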
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
new file mode 100644
index 0000000000..533b75cdae
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+namespace llvm_ir {
+
+namespace {
+// Returns the indices of the first elements of all consecutive subarrays of the
+// given array. For example:
+// ConsecutiveSegments({m, m+1, m+2, n, k, k+1}) = {0, 3, 4}
+std::vector<size_t> ConsecutiveSegments(tensorflow::gtl::ArraySlice<int64> xs) {
+ std::vector<size_t> is = {0};
+ for (size_t i = 1; i < xs.size(); ++i) {
+ if (1 != xs[i] - xs[i - 1]) {
+ is.push_back(i);
+ }
+ }
+ return is;
+}
+
+// Merges the sequences of dimensions of the given shape which start at the
+// given indices `segs`.
+Shape MergeDimensions(tensorflow::gtl::ArraySlice<size_t> segs,
+ const Shape& shape) {
+ std::vector<int64> dimensions;
+ for (size_t i = 1; i <= segs.size(); ++i) {
+ dimensions.push_back(std::accumulate(
+ shape.dimensions().begin() + segs[i - 1],
+ shape.dimensions().begin() +
+ (segs.size() == i ? shape.dimensions().size() : segs[i]),
+ 1, std::multiplies<int64>()));
+ }
+ return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
+ dimensions);
+}
+} // namespace
+
+tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(
+ const Shape& a, const Shape& b) {
+ if (!ShapeUtil::CompatibleIgnoringElementType(a, b)) {
+ return tensorflow::gtl::nullopt;
+ }
+
+ std::vector<int64> perm(a.dimensions().size());
+ {
+ auto layout_a_orig = LayoutUtil::MinorToMajor(a);
+ std::vector<int64> layout_a(layout_a_orig.rbegin(), layout_a_orig.rend());
+ auto layout_b_orig = LayoutUtil::MinorToMajor(b);
+ std::vector<int64> layout_b(layout_b_orig.rbegin(), layout_b_orig.rend());
+ for (size_t i = 0; i < perm.size(); ++i) {
+ perm[i] = PositionInContainer(layout_b, layout_a[i]);
+ }
+ }
+ auto segs = ConsecutiveSegments(perm);
+ if ((3 == segs.size() && 0 == perm[0]) || 2 == segs.size()) {
+ Shape norm_a =
+ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a);
+ Shape reduced_a = MergeDimensions(segs, norm_a);
+ auto reduced_a_dims = reduced_a.dimensions();
+ std::vector<int64> dims_021;
+ if (2 == segs.size()) {
+ // The logical component-0 is of size one.
+ dims_021 = {1, reduced_a_dims[1], reduced_a_dims[0]};
+ } else {
+ dims_021 = {reduced_a_dims[0], reduced_a_dims[2], reduced_a_dims[1]};
+ }
+
+ return dims_021;
+ }
+
+ return tensorflow::gtl::nullopt;
+}
+
+IrArray::Index GetUnreducedOutputIndex(
+ const IrArray::Index& reduced_output_index,
+ const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
+ llvm::IRBuilder<>* ir_builder) {
+ auto bounds = reduced_output_shape.dimensions();
+ auto minor_to_major = reduced_output_shape.layout().minor_to_major();
+ llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0);
+ int64 multiplier = 1;
+ for (int i = 0; i < reduced_output_index.size(); ++i) {
+ int64 dim = minor_to_major[i];
+ llvm::Value* addend = ir_builder->CreateMul(
+ reduced_output_index[dim],
+ reduced_output_index.GetConstantWithIndexType(multiplier),
+ "linearizing",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ linear_index = ir_builder->CreateAdd(linear_index, addend, "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ multiplier *= bounds[dim];
+ }
+
+ return IrArray::Index(linear_index, unreduced_output_shape, ir_builder);
+}
+
+} // namespace llvm_ir
+} // namespace xla
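A worked example of the reduction above, as a standalone C++ sketch (shapes chosen for illustration, types simplified):

#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone replica of the segmentation step in kernel_tiling.cc.
std::vector<size_t> ConsecutiveSegments(const std::vector<int64_t>& xs) {
  std::vector<size_t> is = {0};
  for (size_t i = 1; i < xs.size(); ++i) {
    if (xs[i] - xs[i - 1] != 1) is.push_back(i);
  }
  return is;
}

int main() {
  // Suppose shape `a` is F32[8,16,32] with minor-to-major layout {2,1,0} and
  // `b` is the same logical shape with layout {1,2,0}. Read major-to-minor,
  // layout_a = {0,1,2} and layout_b = {0,2,1}, so perm = {0,2,1}: a's
  // physical dimensions reappear in b in 0-2-1 order.
  std::vector<int64_t> perm = {0, 2, 1};
  std::vector<size_t> segs = ConsecutiveSegments(perm);
  std::printf("segments: %zu\n", segs.size());  // prints "segments: 3"
  // Three segments with perm[0] == 0 means a 0-2-1 transpose. MergeDimensions
  // leaves the physical dims {8,16,32} unchanged here, and FindTranspose021
  // returns dims_021 = {8, 32, 16}: component 0 is untouched while the two
  // minor components swap.
}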
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
new file mode 100644
index 0000000000..6f1268fffb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_
+
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+
+namespace xla {
+namespace llvm_ir {
+
+// About 0-2-1 transpose:
+//
+// If a shape can be viewed as three logical components 0-1-2 in the order of
+// major to minor, a 0-2-1-transpose changes the order of such logical
+// components to 0-2-1. We call the shape being transposed the input shape and
+// the transposed shape the output shape. The logical view of the input and
+// output shapes for the transpose are called the 0-1-2 shape or reduced input
+// shape and the 0-2-1 shape or the reduced output shape respectively. The
+// original input and output shapes are called the unreduced input and output
+// shapes.
+
+// If `b` is a 0-2-1 transpose of `a` in 0-1-2, return the dimensions for the
+// reduced shape of `b` or the 0-2-1 shape.
+tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
+ const Shape& b);
+
+// Return the unreduced output index corresponding to the given reduced output
+// index.
+IrArray::Index GetUnreducedOutputIndex(
+ const IrArray::Index& reduced_output_index,
+ const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
+ llvm::IRBuilder<>* ir_builder);
+
+// A class to represent information for tiled parameters to support IR emission
+// for 021 transpose.
+class TiledParameterInfo {
+ public:
+ TiledParameterInfo(tensorflow::gtl::ArraySlice<llvm::Value*> param_buffers,
+ llvm::Value* y, llvm::Value* x)
+ : param_buffers_(param_buffers), y_(y), x_(x) {}
+
+ llvm::Value* x() const { return x_; }
+ llvm::Value* y() const { return y_; }
+
+ void set_x(llvm::Value* x) { x_ = x; }
+ void set_y(llvm::Value* y) { y_ = y; }
+
+ llvm::Value* GetBufferForParameter(int64 index) const {
+ return param_buffers_[index];
+ }
+
+ private:
+ // param_buffers_[i] stores the tile buffer for the ith parameter, or nullptr
+ // if the parameter is not tiled.
+ tensorflow::gtl::ArraySlice<llvm::Value*> param_buffers_;
+ // The y coordinate within a tile.
+ llvm::Value* y_;
+ // The x coordinate within a tile.
+ llvm::Value* x_;
+};
+
+} // namespace llvm_ir
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_
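The loop in GetUnreducedOutputIndex is ordinary mixed-radix linearization; a plain-integer C++ analogue (index, bounds, and layout values are illustrative):

#include <cstdint>
#include <vector>

// Walk the dimensions minor-to-major, scaling each index component by the
// product of the bounds of all more-minor dimensions.
int64_t LinearizeIndex(const std::vector<int64_t>& index,
                       const std::vector<int64_t>& bounds,
                       const std::vector<int64_t>& minor_to_major) {
  int64_t linear = 0;
  int64_t multiplier = 1;
  for (size_t i = 0; i < index.size(); ++i) {
    const int64_t dim = minor_to_major[i];
    linear += index[dim] * multiplier;
    multiplier *= bounds[dim];
  }
  return linear;
}

// E.g. index {1,2,3} with bounds {4,5,6} and minor_to_major {2,1,0} yields
// 3 + 2*6 + 1*30 = 45; IrArray::Index(linear_index, shape, ir_builder) then
// delinearizes that value into coordinates of the unreduced output shape.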