Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc')
 tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc | 396 ++++++++++++++++
 1 file changed, 396 insertions(+), 0 deletions(-)
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
new file mode 100644
index 0000000000..e318ade5ee
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -0,0 +1,396 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h"
+
+#include <stddef.h>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+// IWYU pragma: no_include "llvm/IR/Attributes.gen.inc"
+// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "external/llvm/include/llvm/ADT/APInt.h"
+#include "external/llvm/include/llvm/IR/BasicBlock.h"
+#include "external/llvm/include/llvm/IR/Instructions.h"
+#include "external/llvm/include/llvm/IR/Intrinsics.h"
+#include "external/llvm/include/llvm/IR/Module.h"
+#include "external/llvm/include/llvm/IR/Type.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace xla {
+namespace gpu {
+
+using llvm_ir::IrArray;
+using llvm_ir::SetToFirstInsertPoint;
+
+GpuElementalIrEmitter::GpuElementalIrEmitter(
+ const HloModuleConfig& hlo_module_config, llvm::Module* module,
+ llvm::IRBuilder<>* ir_builder, NestedComputer compute_nested)
+ : ElementalIrEmitter(hlo_module_config, module, ir_builder),
+ compute_nested_(std::move(compute_nested)) {}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitMathCall(
+ const string& callee_name,
+ tensorflow::gtl::ArraySlice<llvm::Value*> operands,
+ tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
+ PrimitiveType output_type) const {
+  // These math functions are of type [T] -> T, so every input type must
+  // match the output type.
+ for (PrimitiveType input_type : input_types) {
+ if (output_type != input_type) {
+      return Unimplemented("Input type != output type: %s != %s",
+                           PrimitiveType_Name(input_type).c_str(),
+                           PrimitiveType_Name(output_type).c_str());
+ }
+ }
+
+ // The libdevice math functions differentiate between "double" and "float" by
+ // appending an 'f' to the function's name.
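+  // For example, "__nv_exp" becomes "__nv_expf" for F32 operands and is
+  // used unchanged for F64.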
+ string function_name = callee_name;
+ switch (output_type) {
+ case F32:
+ function_name += 'f';
+ break;
+ case F64:
+ break;
+ default:
+ return Unimplemented("Bad type for math call: %s",
+ PrimitiveType_Name(output_type).c_str());
+ }
+
+ return EmitDeviceFunctionCall(
+ function_name, operands, input_types, output_type,
+ {llvm::Attribute::ReadNone, llvm::Attribute::NoUnwind});
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatBinaryOp(
+ const HloInstruction* op, llvm::Value* lhs_value,
+ llvm::Value* rhs_value) const {
+ PrimitiveType lhs_input_type = op->operand(0)->shape().element_type();
+ PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
+ PrimitiveType output_type = op->shape().element_type();
+ switch (op->opcode()) {
+ case HloOpcode::kRemainder: {
+ return EmitMathCall("__nv_fmod", {lhs_value, rhs_value},
+ {lhs_input_type, rhs_input_type}, output_type);
+ }
+ case HloOpcode::kPower: {
+ return EmitMathCall("__nv_pow", {lhs_value, rhs_value},
+ {lhs_input_type, rhs_input_type}, output_type);
+ }
+ default:
+ return ElementalIrEmitter::EmitFloatBinaryOp(op, lhs_value, rhs_value);
+ }
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitErfcInv(
+ PrimitiveType prim_type, llvm::Value* value) const {
+ return EmitMathCall("__nv_erfcinv", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
+ const HloInstruction* op, llvm::Value* operand_value) const {
+ PrimitiveType input_type = op->operand(0)->shape().element_type();
+ PrimitiveType output_type = op->shape().element_type();
+ switch (op->opcode()) {
+ case HloOpcode::kExp:
+ return EmitMathCall("__nv_exp", {operand_value}, {input_type},
+ output_type);
+ case HloOpcode::kFloor:
+ return EmitMathCall("__nv_floor", {operand_value}, {input_type},
+ output_type);
+ case HloOpcode::kCeil:
+ return EmitMathCall("__nv_ceil", {operand_value}, {input_type},
+ output_type);
+ case HloOpcode::kLog:
+ return EmitMathCall("__nv_log", {operand_value}, {input_type},
+ output_type);
+ case HloOpcode::kTanh:
+ return EmitMathCall("__nv_tanh", {operand_value}, {input_type},
+ output_type);
+ default:
+ return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value);
+ }
+}
+
+llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
+ const string& callee_name,
+ tensorflow::gtl::ArraySlice<llvm::Value*> operands,
+ tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
+ PrimitiveType output_type,
+ tensorflow::gtl::ArraySlice<llvm::Attribute::AttrKind> attributes) const {
+ std::vector<llvm::Type*> ir_input_types;
+ for (PrimitiveType input_type : input_types) {
+ ir_input_types.push_back(
+ llvm_ir::PrimitiveTypeToIrType(input_type, ir_builder_));
+ }
+ llvm::FunctionType* callee_type = llvm::FunctionType::get(
+ llvm_ir::PrimitiveTypeToIrType(output_type,
+ ir_builder_), // The return type.
+ ir_input_types, // The parameter types.
+ false); // No variadic arguments.
+
+ // Declares the callee if it is not declared already.
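+  // getOrInsertFunction returns the existing function if a declaration with
+  // this name and type is already present in the module.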
+ llvm::Function* callee = llvm::cast<llvm::Function>(
+ ir_builder_->GetInsertBlock()->getModule()->getOrInsertFunction(
+ llvm_ir::AsStringRef(callee_name), callee_type));
+
+ for (auto attribute : attributes) {
+ callee->addFnAttr(attribute);
+ }
+
+ return ir_builder_->CreateCall(callee, llvm_ir::AsArrayRef(operands));
+}
+
+llvm::Value* GpuElementalIrEmitter::EmitThreadId() const {
+ llvm::Value* block_id = ir_builder_->CreateIntCast(
+ llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
+ {}, {}, ir_builder_),
+ ir_builder_->getIntNTy(128), /*isSigned=*/true, "block.id");
+ llvm::Value* thread_id_in_block = ir_builder_->CreateIntCast(
+ llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x,
+ {}, {}, ir_builder_),
+ ir_builder_->getIntNTy(128), /*isSigned=*/true, "thread.id");
+ llvm::Value* threads_per_block = ir_builder_->CreateIntCast(
+ llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x,
+ {}, {}, ir_builder_),
+ ir_builder_->getIntNTy(128), /*isSigned=*/true, "threads_per_block");
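+  // The counters above are widened to i128 so the NSW multiply/add below
+  // cannot overflow while computing
+  // block_id * threads_per_block + thread_id_in_block.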
+ return ir_builder_->CreateNSWAdd(
+ ir_builder_->CreateNSWMul(block_id, threads_per_block),
+ thread_id_in_block);
+}
+
+llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
+ const HloInstruction* hlo,
+ const HloToElementGeneratorMap& operand_to_generator) const {
+ switch (hlo->opcode()) {
+ case HloOpcode::kPad:
+ return [=, &operand_to_generator](
+ const IrArray::Index& padded_index) -> StatusOr<llvm::Value*> {
+ auto index = padded_index;
+ llvm::Value* in_bounds =
+ llvm::ConstantInt::get(ir_builder_->getInt1Ty(), 1);
+        for (size_t i = 0; i < index.size(); ++i) {
+ auto index_typed_const = [=](int64 n) {
+ return llvm::ConstantInt::get(index[i]->getType(), n);
+ };
+ const auto& pad_dim = hlo->padding_config().dimensions(i);
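+          // Map the padded index back into the operand: shift off the low
+          // edge padding, then keep only indices that land on an
+          // interior-padding stride, i.e. are divisible by
+          // interior_padding + 1.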
+ index[i] = ir_builder_->CreateSub(
+ index[i], index_typed_const(pad_dim.edge_padding_low()));
+ in_bounds = ir_builder_->CreateAnd(
+ in_bounds,
+ ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)),
+ "in_bounds");
+ in_bounds = ir_builder_->CreateAnd(
+ in_bounds,
+ ir_builder_->CreateICmpEQ(
+ index_typed_const(0),
+ ir_builder_->CreateURem(
+ index[i],
+ index_typed_const(pad_dim.interior_padding() + 1))),
+ "in_bounds");
+ index[i] = ir_builder_->CreateSDiv(
+ index[i], index_typed_const(pad_dim.interior_padding() + 1));
+ in_bounds = ir_builder_->CreateAnd(
+ in_bounds,
+ ir_builder_->CreateICmpSLT(
+ index[i],
+ index_typed_const(hlo->operand(0)->shape().dimensions(i))),
+ "in_bounds");
+ }
+
+ // if (in_bounds) {
+ // ret_value = operand0[index]; // source
+ // } else {
+ // ret_value = *operand1; // padding
+ // }
+ llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
+ llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(),
+ ir_builder_),
+ "pad_result_addr", ir_builder_);
+ llvm_ir::LlvmIfData if_data =
+ llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
+ SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+ TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
+ operand_to_generator.at(hlo->operand(0))(index));
+ ir_builder_->CreateStore(operand_value, ret_value_addr);
+
+ SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+ TF_ASSIGN_OR_RETURN(llvm::Value * padding_value,
+ operand_to_generator.at(hlo->operand(1))({}));
+ ir_builder_->CreateStore(padding_value, ret_value_addr);
+
+ SetToFirstInsertPoint(if_data.after_block, ir_builder_);
+ // Don't create phi(operand_value, padding_value) here, because invoking
+ // operand_to_generator may create new basic blocks, making the parent
+ // of operand_value or padding_value no longer a predecessor of
+ // if_data.after_block.
+ return ir_builder_->CreateLoad(ret_value_addr);
+ };
+ case HloOpcode::kMap:
+ return [=, &operand_to_generator](
+ const IrArray::Index& index) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(!hlo->operands().empty())
+ << "Zero operand map not implemented in GPU backend.";
+ TF_RET_CHECK(hlo->to_apply()->num_parameters() > 0);
+ std::vector<llvm::Value*> operand_elements;
+ for (HloInstruction* operand : hlo->operands()) {
+ TF_ASSIGN_OR_RETURN(llvm::Value * value,
+ operand_to_generator.at(operand)(index));
+ operand_elements.push_back(value);
+ }
+ return compute_nested_(*hlo->to_apply(), operand_elements);
+ };
+ case HloOpcode::kReduceWindow:
+ // Pseudocode:
+      // for each index O in output
+ // value = init_value
+ // for each index W in window
+ // for each dimension i from 0 to rank - 1
+ // (input index I)[i] = O[i] * stride[i] + W[i] - pad_low[i]
+ // if I in bounds of input
+ // value = function(value, input[I])
+ // output[O] = value
+ return [=, &operand_to_generator](
+ const IrArray::Index& index) -> StatusOr<llvm::Value*> {
+ const HloInstruction* operand = hlo->operand(0);
+ const Window& window = hlo->window();
+
+ // TODO(b/31410564): Implement dilation for reduce-window.
+ if (window_util::HasDilation(window)) {
+ return Unimplemented(
+ "Dilation for reduce-window not implemented on GPU. "
+ "See b/31410564.");
+ }
+
+ PrimitiveType operand_element_type = operand->shape().element_type();
+ llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry(
+ llvm_ir::PrimitiveTypeToIrType(operand_element_type, ir_builder_),
+ "reduce_window_accum_ptr", ir_builder_);
+ {
+ TF_ASSIGN_OR_RETURN(llvm::Value * init_value,
+ operand_to_generator.at(hlo->operand(1))({}));
+ ir_builder_->CreateStore(init_value, accum_ptr);
+ }
+
+ llvm_ir::ForLoopNest loops(ir_builder_);
+ std::vector<int64> window_size;
+ for (const auto& dim : window.dimensions()) {
+ window_size.push_back(dim.size());
+ }
+ const IrArray::Index window_index = loops.AddLoopsForShape(
+ ShapeUtil::MakeShape(operand_element_type, window_size), "window");
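+        // window_index has one loop induction variable per window dimension
+        // and enumerates every position within the window.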
+ CHECK_EQ(window_index.size(), index.size());
+
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder_);
+
+ IrArray::Index input_index(index.size());
+ llvm::Value* in_bounds = ir_builder_->getInt1(1);
+ for (size_t i = 0; i < index.size(); ++i) {
+          llvm::Value* strided_index = ir_builder_->CreateNSWMul(
+              index[i], ir_builder_->getInt64(window.dimensions(i).stride()));
+          input_index[i] = ir_builder_->CreateNSWSub(
+              ir_builder_->CreateNSWAdd(strided_index, window_index[i]),
+              ir_builder_->getInt64(window.dimensions(i).padding_low()));
+
+          // We must check whether 0 <= input_index[i] < bound, as otherwise
+          // we are in the padding and can skip the computation. This check
+          // reduces to the single unsigned comparison
+          // input_index[i] < bound, because a negative value wraps to a
+          // large positive value.
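+          // For example, with bound = 10, an index of -1 wraps to 2^64 - 1
+          // and so fails the unsigned less-than test.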
+ in_bounds = ir_builder_->CreateAnd(
+ in_bounds,
+ ir_builder_->CreateICmpULT(
+ input_index[i],
+ ir_builder_->getInt64(operand->shape().dimensions(i))));
+ }
+
+ llvm_ir::LlvmIfData if_data =
+ llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
+ SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+
+ // We are not in pad, so do the computation.
+ TF_ASSIGN_OR_RETURN(llvm::Value * input_value,
+ operand_to_generator.at(operand)(input_index));
+ TF_ASSIGN_OR_RETURN(
+ llvm::Value * accum_value,
+ compute_nested_(*hlo->to_apply(),
+ {ir_builder_->CreateLoad(accum_ptr), input_value}));
+ ir_builder_->CreateStore(accum_value, accum_ptr);
+
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder_);
+ return ir_builder_->CreateLoad(accum_ptr);
+ };
+ case HloOpcode::kReduce:
+ return [=, &operand_to_generator](
+ const IrArray::Index& output_index) -> StatusOr<llvm::Value*> {
+ const HloInstruction* operand = hlo->operand(0);
+ llvm::Value* accum_ptr =
+ ir_builder()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
+ hlo->shape().element_type(), ir_builder()));
+ TF_ASSIGN_OR_RETURN(llvm::Value * init_value,
+ operand_to_generator.at(hlo->operand(1))({}));
+ ir_builder()->CreateStore(init_value, accum_ptr);
+
+ llvm_ir::ForLoopNest loops(ir_builder_);
+ IrArray::Index input_index = loops.AddLoopsForShapeOnDimensions(
+ operand->shape(), hlo->dimensions(), "reduction_dim");
+ if (!ShapeUtil::IsScalar(hlo->shape())) {
+          // AddLoopsForShapeOnDimensions only set the entries of input_index
+          // for the reduction dimensions (hlo->dimensions()); fill in the
+          // remaining entries from output_index.
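+          // For example, reducing a rank-3 operand over dimension {1} gives
+          // input_index = (output_index[0], reduction_dim, output_index[1]).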
+ size_t j = 0;
+ for (size_t i = 0; i < input_index.size(); ++i) {
+ if (input_index[i] == nullptr) {
+ input_index[i] = output_index[j++];
+ }
+ }
+ CHECK_EQ(output_index.size(), j);
+ }
+
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder());
+ TF_ASSIGN_OR_RETURN(
+ llvm::Value * input_value,
+ operand_to_generator.at(hlo->operand(0))(input_index));
+ TF_ASSIGN_OR_RETURN(
+ llvm::Value * accum_value,
+ compute_nested_(
+ *hlo->to_apply(),
+ {ir_builder()->CreateLoad(accum_ptr), input_value}));
+ ir_builder()->CreateStore(accum_value, accum_ptr);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder());
+ return ir_builder()->CreateLoad(accum_ptr);
+ };
+ default:
+ return ElementalIrEmitter::MakeElementGenerator(hlo,
+ operand_to_generator);
+ }
+}
+
+} // namespace gpu
+} // namespace xla