Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc | 396 |
1 files changed, 396 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
new file mode 100644
index 0000000000..e318ade5ee
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -0,0 +1,396 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h"
+
+#include <stddef.h>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+// IWYU pragma: no_include "llvm/IR/Attributes.gen.inc"
+// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "external/llvm/include/llvm/ADT/APInt.h"
+#include "external/llvm/include/llvm/IR/BasicBlock.h"
+#include "external/llvm/include/llvm/IR/Instructions.h"
+#include "external/llvm/include/llvm/IR/Intrinsics.h"
+#include "external/llvm/include/llvm/IR/Module.h"
+#include "external/llvm/include/llvm/IR/Type.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace xla {
+namespace gpu {
+
+using llvm_ir::IrArray;
+using llvm_ir::SetToFirstInsertPoint;
+
+GpuElementalIrEmitter::GpuElementalIrEmitter(
+    const HloModuleConfig& hlo_module_config, llvm::Module* module,
+    llvm::IRBuilder<>* ir_builder, NestedComputer compute_nested)
+    : ElementalIrEmitter(hlo_module_config, module, ir_builder),
+      compute_nested_(std::move(compute_nested)) {}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitMathCall(
+    const string& callee_name,
+    tensorflow::gtl::ArraySlice<llvm::Value*> operands,
+    tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
+    PrimitiveType output_type) const {
+  // Binary math functions are of type [T] -> T.
+  for (PrimitiveType input_type : input_types) {
+    if (output_type != input_type) {
+      return Unimplemented("Input type ≠ output type: %s ≠ %s",
+                           PrimitiveType_Name(input_type).c_str(),
+                           PrimitiveType_Name(output_type).c_str());
+    }
+  }
+
+  // The libdevice math functions differentiate between "double" and "float"
+  // by appending an 'f' to the function's name.
+  string function_name = callee_name;
+  switch (output_type) {
+    case F32:
+      function_name += 'f';
+      break;
+    case F64:
+      break;
+    default:
+      return Unimplemented("Bad type for math call: %s",
+                           PrimitiveType_Name(output_type).c_str());
+  }
+
+  return EmitDeviceFunctionCall(
+      function_name, operands, input_types, output_type,
+      {llvm::Attribute::ReadNone, llvm::Attribute::NoUnwind});
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatBinaryOp(
+    const HloInstruction* op, llvm::Value* lhs_value,
+    llvm::Value* rhs_value) const {
+  PrimitiveType lhs_input_type = op->operand(0)->shape().element_type();
+  PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
+  PrimitiveType output_type = op->shape().element_type();
+  switch (op->opcode()) {
+    case HloOpcode::kRemainder: {
+      return EmitMathCall("__nv_fmod", {lhs_value, rhs_value},
+                          {lhs_input_type, rhs_input_type}, output_type);
+    }
+    case HloOpcode::kPower: {
+      return EmitMathCall("__nv_pow", {lhs_value, rhs_value},
+                          {lhs_input_type, rhs_input_type}, output_type);
+    }
+    default:
+      return ElementalIrEmitter::EmitFloatBinaryOp(op, lhs_value, rhs_value);
+  }
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitErfcInv(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitMathCall("__nv_erfcinv", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
+    const HloInstruction* op, llvm::Value* operand_value) const {
+  PrimitiveType input_type = op->operand(0)->shape().element_type();
+  PrimitiveType output_type = op->shape().element_type();
+  switch (op->opcode()) {
+    case HloOpcode::kExp:
+      return EmitMathCall("__nv_exp", {operand_value}, {input_type},
+                          output_type);
+    case HloOpcode::kFloor:
+      return EmitMathCall("__nv_floor", {operand_value}, {input_type},
+                          output_type);
+    case HloOpcode::kCeil:
+      return EmitMathCall("__nv_ceil", {operand_value}, {input_type},
+                          output_type);
+    case HloOpcode::kLog:
+      return EmitMathCall("__nv_log", {operand_value}, {input_type},
+                          output_type);
+    case HloOpcode::kTanh:
+      return EmitMathCall("__nv_tanh", {operand_value}, {input_type},
+                          output_type);
+    default:
+      return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value);
+  }
+}
+
+llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
+    const string& callee_name,
+    tensorflow::gtl::ArraySlice<llvm::Value*> operands,
+    tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
+    PrimitiveType output_type,
+    tensorflow::gtl::ArraySlice<llvm::Attribute::AttrKind> attributes) const {
+  std::vector<llvm::Type*> ir_input_types;
+  for (PrimitiveType input_type : input_types) {
+    ir_input_types.push_back(
+        llvm_ir::PrimitiveTypeToIrType(input_type, ir_builder_));
+  }
+  llvm::FunctionType* callee_type = llvm::FunctionType::get(
+      llvm_ir::PrimitiveTypeToIrType(output_type,
+                                     ir_builder_),  // The return type.
+      ir_input_types,                               // The parameter types.
+      false);                                       // No variadic arguments.
+
+  // Declares the callee if it is not declared already.
+  llvm::Function* callee = llvm::cast<llvm::Function>(
+      ir_builder_->GetInsertBlock()->getModule()->getOrInsertFunction(
+          llvm_ir::AsStringRef(callee_name), callee_type));
+
+  for (auto attribute : attributes) {
+    callee->addFnAttr(attribute);
+  }
+
+  return ir_builder_->CreateCall(callee, llvm_ir::AsArrayRef(operands));
+}
+
+llvm::Value* GpuElementalIrEmitter::EmitThreadId() const {
+  llvm::Value* block_id = ir_builder_->CreateIntCast(
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
+                                   {}, {}, ir_builder_),
+      ir_builder_->getIntNTy(128), /*isSigned=*/true, "block.id");
+  llvm::Value* thread_id_in_block = ir_builder_->CreateIntCast(
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x,
+                                   {}, {}, ir_builder_),
+      ir_builder_->getIntNTy(128), /*isSigned=*/true, "thread.id");
+  llvm::Value* threads_per_block = ir_builder_->CreateIntCast(
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x,
+                                   {}, {}, ir_builder_),
+      ir_builder_->getIntNTy(128), /*isSigned=*/true, "threads_per_block");
+  return ir_builder_->CreateNSWAdd(
+      ir_builder_->CreateNSWMul(block_id, threads_per_block),
+      thread_id_in_block);
+}
+
+llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
+    const HloInstruction* hlo,
+    const HloToElementGeneratorMap& operand_to_generator) const {
+  switch (hlo->opcode()) {
+    case HloOpcode::kPad:
+      return [=, &operand_to_generator](
+                 const IrArray::Index& padded_index) -> StatusOr<llvm::Value*> {
+        auto index = padded_index;
+        llvm::Value* in_bounds =
+            llvm::ConstantInt::get(ir_builder_->getInt1Ty(), 1);
+        for (int i = 0; i < index.size(); ++i) {
+          auto index_typed_const = [=](int64 n) {
+            return llvm::ConstantInt::get(index[i]->getType(), n);
+          };
+          const auto& pad_dim = hlo->padding_config().dimensions(i);
+          index[i] = ir_builder_->CreateSub(
+              index[i], index_typed_const(pad_dim.edge_padding_low()));
+          in_bounds = ir_builder_->CreateAnd(
+              in_bounds,
+              ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)),
+              "in_bounds");
+          in_bounds = ir_builder_->CreateAnd(
+              in_bounds,
+              ir_builder_->CreateICmpEQ(
+                  index_typed_const(0),
+                  ir_builder_->CreateURem(
+                      index[i],
+                      index_typed_const(pad_dim.interior_padding() + 1))),
+              "in_bounds");
+          index[i] = ir_builder_->CreateSDiv(
+              index[i], index_typed_const(pad_dim.interior_padding() + 1));
+          in_bounds = ir_builder_->CreateAnd(
+              in_bounds,
+              ir_builder_->CreateICmpSLT(
+                  index[i],
+                  index_typed_const(hlo->operand(0)->shape().dimensions(i))),
+              "in_bounds");
+        }
+
+        // if (in_bounds) {
+        //   ret_value = operand0[index];  // source
+        // } else {
+        //   ret_value = *operand1;  // padding
+        // }
+        llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
+            llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(),
+                                           ir_builder_),
+            "pad_result_addr", ir_builder_);
+        llvm_ir::LlvmIfData if_data =
+            llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
+        SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+        TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
+                            operand_to_generator.at(hlo->operand(0))(index));
+        ir_builder_->CreateStore(operand_value, ret_value_addr);
+
+        SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+        TF_ASSIGN_OR_RETURN(llvm::Value * padding_value,
+                            operand_to_generator.at(hlo->operand(1))({}));
+        ir_builder_->CreateStore(padding_value, ret_value_addr);
+
+        SetToFirstInsertPoint(if_data.after_block, ir_builder_);
+        // Don't create phi(operand_value, padding_value) here, because
+        // invoking operand_to_generator may
+        // create new basic blocks, making the parent of operand_value or
+        // padding_value no longer a predecessor of if_data.after_block.
+        return ir_builder_->CreateLoad(ret_value_addr);
+      };
+    case HloOpcode::kMap:
+      return [=, &operand_to_generator](
+                 const IrArray::Index& index) -> StatusOr<llvm::Value*> {
+        TF_RET_CHECK(!hlo->operands().empty())
+            << "Zero operand map not implemented in GPU backend.";
+        TF_RET_CHECK(hlo->to_apply()->num_parameters() > 0);
+        std::vector<llvm::Value*> operand_elements;
+        for (HloInstruction* operand : hlo->operands()) {
+          TF_ASSIGN_OR_RETURN(llvm::Value * value,
+                              operand_to_generator.at(operand)(index));
+          operand_elements.push_back(value);
+        }
+        return compute_nested_(*hlo->to_apply(), operand_elements);
+      };
+    case HloOpcode::kReduceWindow:
+      // Pseudocode:
+      // for each index O in output
+      //   value = init_value
+      //   for each index W in window
+      //     for each dimension i from 0 to rank - 1
+      //       (input index I)[i] = O[i] * stride[i] + W[i] - pad_low[i]
+      //     if I in bounds of input
+      //       value = function(value, input[I])
+      //   output[O] = value
+      return [=, &operand_to_generator](
+                 const IrArray::Index& index) -> StatusOr<llvm::Value*> {
+        const HloInstruction* operand = hlo->operand(0);
+        const Window& window = hlo->window();
+
+        // TODO(b/31410564): Implement dilation for reduce-window.
+        if (window_util::HasDilation(window)) {
+          return Unimplemented(
+              "Dilation for reduce-window not implemented on GPU. "
+              "See b/31410564.");
+        }
+
+        PrimitiveType operand_element_type = operand->shape().element_type();
+        llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry(
+            llvm_ir::PrimitiveTypeToIrType(operand_element_type, ir_builder_),
+            "reduce_window_accum_ptr", ir_builder_);
+        {
+          TF_ASSIGN_OR_RETURN(llvm::Value * init_value,
+                              operand_to_generator.at(hlo->operand(1))({}));
+          ir_builder_->CreateStore(init_value, accum_ptr);
+        }
+
+        llvm_ir::ForLoopNest loops(ir_builder_);
+        std::vector<int64> window_size;
+        for (const auto& dim : window.dimensions()) {
+          window_size.push_back(dim.size());
+        }
+        const IrArray::Index window_index = loops.AddLoopsForShape(
+            ShapeUtil::MakeShape(operand_element_type, window_size), "window");
+        CHECK_EQ(window_index.size(), index.size());
+
+        SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder_);
+
+        IrArray::Index input_index(index.size());
+        llvm::Value* in_bounds = ir_builder_->getInt1(1);
+        for (size_t i = 0; i < index.size(); ++i) {
+          llvm::Value* stridden_index = ir_builder_->CreateNSWMul(
+              index[i], ir_builder_->getInt64(window.dimensions(i).stride()));
+          input_index[i] = ir_builder_->CreateNSWSub(
+              ir_builder_->CreateNSWAdd(stridden_index, window_index[i]),
+              ir_builder_->getInt64(window.dimensions(i).padding_low()));
+
+          // We must check whether 0 ≤ input_index[i] < bound, as otherwise
+          // we are in the pad and so can skip the computation. This
+          // comparison is equivalent to the unsigned comparison
+          // input_index[i] < bound, as a negative value wraps to a large
+          // positive value.
+          in_bounds = ir_builder_->CreateAnd(
+              in_bounds,
+              ir_builder_->CreateICmpULT(
+                  input_index[i],
+                  ir_builder_->getInt64(operand->shape().dimensions(i))));
+        }
+
+        llvm_ir::LlvmIfData if_data =
+            llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
+        SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+
+        // We are not in the pad, so do the computation.
+        TF_ASSIGN_OR_RETURN(llvm::Value * input_value,
+                            operand_to_generator.at(operand)(input_index));
+        TF_ASSIGN_OR_RETURN(
+            llvm::Value * accum_value,
+            compute_nested_(*hlo->to_apply(),
+                            {ir_builder_->CreateLoad(accum_ptr), input_value}));
+        ir_builder_->CreateStore(accum_value, accum_ptr);
+
+        SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder_);
+        return ir_builder_->CreateLoad(accum_ptr);
+      };
+    case HloOpcode::kReduce:
+      return [=, &operand_to_generator](
+                 const IrArray::Index& output_index) -> StatusOr<llvm::Value*> {
+        const HloInstruction* operand = hlo->operand(0);
+        llvm::Value* accum_ptr =
+            ir_builder()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
+                hlo->shape().element_type(), ir_builder()));
+        TF_ASSIGN_OR_RETURN(llvm::Value * init_value,
+                            operand_to_generator.at(hlo->operand(1))({}));
+        ir_builder()->CreateStore(init_value, accum_ptr);
+
+        llvm_ir::ForLoopNest loops(ir_builder_);
+        IrArray::Index input_index = loops.AddLoopsForShapeOnDimensions(
+            operand->shape(), hlo->dimensions(), "reduction_dim");
+        if (!ShapeUtil::IsScalar(hlo->shape())) {
+          // Here only input_index[hlo->dimensions()] are non-null, so we must
+          // set the rest.
+          size_t j = 0;
+          for (size_t i = 0; i < input_index.size(); ++i) {
+            if (input_index[i] == nullptr) {
+              input_index[i] = output_index[j++];
+            }
+          }
+          CHECK_EQ(output_index.size(), j);
+        }
+
+        SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder());
+        TF_ASSIGN_OR_RETURN(
+            llvm::Value * input_value,
+            operand_to_generator.at(hlo->operand(0))(input_index));
+        TF_ASSIGN_OR_RETURN(
+            llvm::Value * accum_value,
+            compute_nested_(
+                *hlo->to_apply(),
+                {ir_builder()->CreateLoad(accum_ptr), input_value}));
+        ir_builder()->CreateStore(accum_value, accum_ptr);
+        SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder());
+        return ir_builder()->CreateLoad(accum_ptr);
+      };
+    default:
+      return ElementalIrEmitter::MakeElementGenerator(hlo,
+                                                      operand_to_generator);
+  }
+}
+
+}  // namespace gpu
+}  // namespace xla
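
For reference, EmitMathCall's libdevice name selection follows a simple rule: append 'f' for F32, keep the plain name for F64, reject everything else. A minimal host-side sketch of that rule; LibdeviceName is a hypothetical helper for illustration, not part of this commit:

// Sketch of EmitMathCall's name-mangling rule (illustrative only;
// LibdeviceName is a hypothetical helper, not part of this diff).
#include <iostream>
#include <stdexcept>
#include <string>

enum PrimitiveType { F32, F64 };

std::string LibdeviceName(const std::string& callee_name, PrimitiveType t) {
  switch (t) {
    case F32:
      return callee_name + 'f';  // e.g. __nv_pow -> __nv_powf
    case F64:
      return callee_name;        // double variants keep the plain name
  }
  throw std::invalid_argument("Bad type for math call");
}

int main() {
  std::cout << LibdeviceName("__nv_pow", F32) << "\n";   // __nv_powf
  std::cout << LibdeviceName("__nv_fmod", F64) << "\n";  // __nv_fmod
}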
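
EmitThreadId linearizes the x-dimension PTX special registers into one id, widening each component (to 128 bits in the emitted IR) before the multiply-add so the product cannot overflow. A host-side model of the same arithmetic, under the assumption that a 64-bit widening suffices for the illustration:

// Host-side model of the linear thread id EmitThreadId computes:
// thread_id = block_id * threads_per_block + thread_id_in_block.
#include <cstdint>
#include <iostream>

int64_t LinearThreadId(int32_t block_id, int32_t threads_per_block,
                       int32_t thread_id_in_block) {
  // Widen first, as the CreateIntCast calls do before CreateNSWMul/Add.
  return static_cast<int64_t>(block_id) * threads_per_block +
         thread_id_in_block;
}

int main() {
  // Block 2 of a grid with 256-thread blocks, thread 5 within the block.
  std::cout << LinearThreadId(2, 256, 5) << "\n";  // 517
}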
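
The kPad generator inverts the padding layout per dimension: subtract edge_padding_low, require the offset to land on an interior-padding stride, divide by that stride, then bounds-check against the operand shape. A host-side sketch of that per-dimension mapping; PadDim and MapPaddedIndex are hypothetical names introduced here for illustration:

// Sketch of the per-dimension index mapping in the kPad generator.
#include <cstdint>
#include <iostream>

struct PadDim {
  int64_t edge_padding_low;
  int64_t interior_padding;
};

// Returns true and sets *input_index when the padded index hits a source
// element; returns false when it falls in edge or interior padding.
bool MapPaddedIndex(int64_t padded_index, const PadDim& dim,
                    int64_t input_bound, int64_t* input_index) {
  int64_t index = padded_index - dim.edge_padding_low;
  if (index < 0) return false;              // low edge padding
  int64_t step = dim.interior_padding + 1;
  if (index % step != 0) return false;      // interior padding
  index /= step;
  if (index >= input_bound) return false;   // high edge padding
  *input_index = index;
  return true;
}

int main() {
  PadDim dim{/*edge_padding_low=*/2, /*interior_padding=*/1};
  int64_t in;
  // Padded layout of a 3-element input: p p x0 i x1 i x2
  std::cout << MapPaddedIndex(2, dim, 3, &in) << " " << in << "\n";  // 1 0
  std::cout << MapPaddedIndex(3, dim, 3, &in) << "\n";               // 0
}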
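
The reduce-window bounds check relies on the unsigned-comparison trick called out in the comment: because a negative signed value wraps to a huge unsigned value, the single ICmpULT covers both "index >= 0" and "index < bound". A host-side sketch of the input-index computation and that trick, with hypothetical names:

// Sketch of the reduce-window input index math and the ICmpULT trick.
#include <cstdint>
#include <iostream>

bool InBounds(int64_t output_index, int64_t window_index, int64_t stride,
              int64_t padding_low, int64_t bound, int64_t* input_index) {
  *input_index = output_index * stride + window_index - padding_low;
  // One unsigned compare stands in for "0 <= i && i < bound".
  return static_cast<uint64_t>(*input_index) < static_cast<uint64_t>(bound);
}

int main() {
  int64_t in;
  // Output index 0, window offset 0, stride 1, padding_low 1, bound 4:
  // input index is -1, i.e. inside the low padding.
  std::cout << InBounds(0, 0, 1, 1, 4, &in) << "\n";               // 0
  std::cout << InBounds(0, 1, 1, 1, 4, &in) << " " << in << "\n";  // 1 0
}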
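
Finally, the kReduce generator builds loops only over the reduced dimensions, so AddLoopsForShapeOnDimensions leaves the other slots of input_index null; those are then filled, in order, from the output index. A host-side sketch of that stitching step; CompleteInputIndex is a hypothetical name for illustration:

// Sketch of how the kReduce generator completes input_index: reduced
// dimensions come from the loop nest, the rest from the output index.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> CompleteInputIndex(
    const std::vector<const int64_t*>& reduction_index,  // nullptr = kept dim
    const std::vector<int64_t>& output_index) {
  std::vector<int64_t> input_index(reduction_index.size());
  size_t j = 0;
  for (size_t i = 0; i < reduction_index.size(); ++i) {
    input_index[i] = reduction_index[i] != nullptr ? *reduction_index[i]
                                                   : output_index[j++];
  }
  // The real emitter asserts CHECK_EQ(output_index.size(), j) here.
  return input_index;
}

int main() {
  int64_t r = 7;  // current value of the reduction loop over dimension 1
  std::vector<const int64_t*> red = {nullptr, &r, nullptr};
  std::vector<int64_t> out = {3, 5};
  for (int64_t v : CompleteInputIndex(red, out)) std::cout << v << " ";
  std::cout << "\n";  // 3 7 5
}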