Diffstat (limited to 'tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc')
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc  197
1 file changed, 197 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
new file mode 100644
index 0000000000..27fbb11e2e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -0,0 +1,197 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
+#include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
+#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
+
+namespace xla {
+namespace llvm_ir {
+
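+// Returns true if the dynamic-update-slice can be emitted in place, i.e. the
+// buffer assignment gives the op and its operand the same top-level slice, so
+// the update can be written directly into the operand's buffer.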
+bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice,
+ const BufferAssignment& assignment) {
+ CHECK_EQ(HloOpcode::kDynamicUpdateSlice, dynamic_update_slice->opcode());
+ const HloInstruction* operand = dynamic_update_slice->operand(0);
+ return assignment.HasTopLevelAllocation(dynamic_update_slice) &&
+ assignment.HasTopLevelAllocation(operand) &&
+ assignment.SharesTopLevelSlice(dynamic_update_slice, operand);
+}
+
+// Shared implementation of EmitDynamicUpdateSliceInPlace and
+// EmitFusedDynamicUpdateSliceInPlace.
+//
+// Emits a sequential loop if launch_dimensions is null.
+static Status EmitDynamicUpdateSliceInPlaceImpl(
+ const Shape& update_shape, const ElementGenerator& start_indices_generator,
+ bool is_signed, ElementGenerator update_array_generator,
+ const IrArray& output_array, const gpu::LaunchDimensions* launch_dimensions,
+ tensorflow::StringPiece name, llvm::IRBuilder<>* b) {
+ const Shape& output_shape = output_array.GetShape();
+
+ // Read start indices from start_indices_generator.
+ const int64 rank = ShapeUtil::Rank(output_shape);
+ IrArray::Index start_index(b->getInt64Ty(), rank);
+ for (int64 i = 0; i < rank; ++i) {
+ IrArray::Index dim_index({b->getInt64(i)});
+ TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
+ llvm::Value* output_dim_size = llvm::ConstantInt::get(
+ start_index[i]->getType(), output_shape.dimensions(i));
+ llvm::Value* update_dim_size = llvm::ConstantInt::get(
+ start_index[i]->getType(), update_shape.dimensions(i));
+
+ // Clamp the start index so that the update region fits in the operand.
+ // start_index = clamp(start_index, 0, output_dim_size - update_dim_size)
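+    // Illustrative example: with output_dim_size = 10 and update_dim_size = 3,
+    // max_bound = 7, so a requested start index of 9 is clamped down to 7 and
+    // (for signed indices) a start index of -2 is clamped up to 0.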
+ llvm::Value* max_bound = b->CreateSub(output_dim_size, update_dim_size);
+ llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0);
+ start_index[i] =
+ b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE
+ : llvm::ICmpInst::ICMP_UGE,
+ zero, start_index[i]),
+ zero, start_index[i]);
+
+ start_index[i] =
+ b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE
+ : llvm::ICmpInst::ICMP_ULE,
+ max_bound, start_index[i]),
+ max_bound, start_index[i]);
+ }
+
+ auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status {
+ // Calculate output_index, where we'll write the value from update. For
+ // each dimension,
+ //
+ // output_index[dim] = start_index[dim] + update_index[dim]
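+    // For example, with start_index = {2, 0} and update_index = {1, 3}, the
+    // update element is written to output position {3, 3}.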
+ //
+ IrArray::Index output_index(start_index.GetType(), rank);
+ for (int64 i = 0; i < rank; ++i) {
+ llvm::Value* start_index0 =
+ b->CreateSExtOrBitCast(start_index[i], update_index[i]->getType());
+ output_index[i] = b->CreateAdd(start_index0, update_index[i]);
+ }
+
+ // Do output[output_index] = update[update_index].
+ TF_ASSIGN_OR_RETURN(llvm::Value * update_data,
+ update_array_generator(update_index));
+ output_array.EmitWriteArrayElement(output_index, update_data, b);
+ return Status::OK();
+ };
+
+ if (launch_dimensions != nullptr) {
+ return gpu::ParallelLoopEmitter(loop_body_emitter, update_shape,
+ *launch_dimensions, b)
+ .EmitLoop(name);
+ }
+ return LoopEmitter(loop_body_emitter, update_shape, b).EmitLoop(name);
+}
+
+Status EmitDynamicUpdateSliceInPlace(
+ tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
+ const IrArray& output_array, tensorflow::StringPiece name,
+ llvm::IRBuilder<>* b) {
+ VLOG(2) << "EmitDynamicUpdateSliceInPlace for " << name;
+
+ // No need to use operand_arrays[0], the input array of the
+ // dynamic-update-slice, because we know it aliases the op's output.
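+  //
+  // The remaining operands are operand 1, the update to apply, and operand 2,
+  // the start indices; both are read below.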
+ IrArray update_array = operand_arrays[1];
+ IrArray start_indices_array = operand_arrays[2];
+ Shape output_shape = output_array.GetShape();
+ Shape update_shape = update_array.GetShape();
+
+ ElementGenerator start_indices_generator = [&](const IrArray::Index& index) {
+ return start_indices_array.EmitReadArrayElement(index, b);
+ };
+ ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
+ return update_array.EmitReadArrayElement(index, b);
+ };
+
+ bool is_signed = ShapeUtil::ElementIsSigned(start_indices_array.GetShape());
+ return EmitDynamicUpdateSliceInPlaceImpl(
+ update_shape, start_indices_generator, is_signed, update_array_generator,
+ output_array, /*launch_dimensions=*/nullptr, name, b);
+}
+
+// Shared implementation for EmitFusedDynamicUpdateSliceInPlace and
+// EmitParallelFusedDynamicUpdateSliceInPlace.
+//
+// Emits a sequential loop if launch_dimensions is null.
+static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
+ HloInstruction* fusion,
+ tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
+ const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
+ const gpu::LaunchDimensions* launch_dimensions, llvm::IRBuilder<>* b) {
+ CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
+ VLOG(2) << "EmitFusedDynamicUpdateSliceInPlace for "
+ << fusion->ToShortString();
+
+ auto* dynamic_update_slice = fusion->fused_expression_root();
+
+ const auto* update = dynamic_update_slice->operand(1);
+ const auto* start_indices = dynamic_update_slice->operand(2);
+ Shape update_shape = update->shape();
+
+ // Our in-place dynamic-update-slice implementation emits a loop over
+ // update_shape. To emit a cache-friendly loop, we need to know that shape's
+ // layout.
+ //
+ // update_shape is inside a fusion node -- it's never materialized in memory
+ // and thus doesn't have a layout. In this case we use the layout of the
+ // fusion node for iteration, since that corresponds to the order in memory of
+ // the buffer we'll be writing to.
+ //
+ // (This isn't necessarily optimal; in some cases it might be faster to peek
+ // through the chain of ops that gives us the update operand and use the
+ // layout of its source buffer(s). But this is no worse than we do with
+ // fusion elsewhere.)
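+  //
+  // For example, if fusion->shape() has minor-to-major layout {0, 1}, that
+  // same layout is copied onto update_shape, so the loop emitted below visits
+  // update elements in the memory order of the output buffer.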
+ TF_RETURN_IF_ERROR(
+ LayoutUtil::CopyLayoutBetweenShapes(fusion->shape(), &update_shape));
+
+ // Create element generators for update and start_indices.
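+  // Accept() walks the fused computation and records an element generator for
+  // each fused instruction; the generators retrieved here compute update and
+  // start-indices elements on demand instead of materializing those operands.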
+ FusedIrEmitter fused_emitter(fusion_operand_arrays, elemental_emitter);
+ TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
+ ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
+ ElementGenerator start_indices_generator =
+ fused_emitter.GetGenerator(start_indices);
+
+ bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape());
+ return EmitDynamicUpdateSliceInPlaceImpl(
+ update_shape, start_indices_generator, is_signed, update_array_generator,
+ fusion_output_array, launch_dimensions, IrName(fusion), b);
+}
+
+Status EmitFusedDynamicUpdateSliceInPlace(
+ HloInstruction* fusion,
+ tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
+ const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
+ llvm::IRBuilder<>* b) {
+ return EmitFusedDynamicUpdateSliceInPlaceImpl(
+ fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
+ /*launch_dimensions=*/nullptr, b);
+}
+
+Status EmitParallelFusedDynamicUpdateSliceInPlace(
+ HloInstruction* fusion,
+ tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
+ const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
+ const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b) {
+ return EmitFusedDynamicUpdateSliceInPlaceImpl(
+ fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
+ &launch_dimensions, b);
+}
+
+} // namespace llvm_ir
+} // namespace xla