path: root/tensorflow/compiler/xla/service/llvm_ir/ops.cc
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/llvm_ir/ops.h"
#include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"

namespace xla {
namespace llvm_ir {

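// Returns true if the dynamic-update-slice can be emitted in place, i.e. if
// buffer assignment gave the op and its first operand the same top-level
// allocation slice, so the update can be written directly into the operand's
// buffer.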
bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice,
                                  const BufferAssignment& assignment) {
  CHECK_EQ(HloOpcode::kDynamicUpdateSlice, dynamic_update_slice->opcode());
  const HloInstruction* operand = dynamic_update_slice->operand(0);
  return assignment.HasTopLevelAllocation(dynamic_update_slice) &&
         assignment.HasTopLevelAllocation(operand) &&
         assignment.SharesTopLevelSlice(dynamic_update_slice, operand);
}

// Shared implementation of EmitDynamicUpdateSliceInPlace and
// EmitFusedDynamicUpdateSliceInPlace.
//
// Emits a sequential loop if launch_dimensions is null.
static Status EmitDynamicUpdateSliceInPlaceImpl(
    const Shape& update_shape, const ElementGenerator& start_indices_generator,
    ElementGenerator update_array_generator, const IrArray& output_array,
    const gpu::LaunchDimensions* launch_dimensions,
    tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder) {
  const Shape& output_shape = output_array.GetShape();

  // Read start indices from start_indices_generator.
  const int64 rank = ShapeUtil::Rank(output_shape);
  IrArray::Index start_index(ir_builder->getInt64Ty(), rank);
  for (int64 i = 0; i < rank; ++i) {
    IrArray::Index dim_index({ir_builder->getInt64(i)});
    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
    llvm::Value* output_dim_size = llvm::ConstantInt::get(
        start_index[i]->getType(), output_shape.dimensions(i));
    llvm::Value* update_dim_size = llvm::ConstantInt::get(
        start_index[i]->getType(), update_shape.dimensions(i));

    // Clamp the start index so that the update region fits in the operand.
    // start_index = clamp(start_index, 0, output_dim_size - update_dim_size)

    // TODO(b/74360564): This is implementation-defined behavior, but it is
    // currently respected by all implementations. Change this if we ever
    // decide to officially document different behavior.
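    // For example, with output_dim_size = 10 and update_dim_size = 3 the
    // valid start indices are [0, 7]: a requested start of 9 is clamped to 7
    // so the 3-element update still fits, and a start of -2 is clamped to 0.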
    llvm::Value* max_bound =
        ir_builder->CreateSub(output_dim_size, update_dim_size);
    llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0);
    start_index[i] = ir_builder->CreateSelect(
        ir_builder->CreateICmp(llvm::ICmpInst::ICMP_SGE, zero, start_index[i]),
        zero, start_index[i]);

    start_index[i] = ir_builder->CreateSelect(
        ir_builder->CreateICmp(llvm::ICmpInst::ICMP_SLE, max_bound,
                               start_index[i]),
        max_bound, start_index[i]);
  }

  auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status {
    // Calculate output_index, where we'll write the value from update.  For
    // each dimension,
    //
    //   output_index[dim] = start_index[dim] + update_index[dim]
    //
    IrArray::Index output_index(start_index.GetType(), rank);
    for (int64 i = 0; i < rank; ++i) {
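      // The generated start indices may have a narrower integer type than the
      // loop induction variables (e.g. S32 start indices with int64 induction
      // variables), so sign-extend each one before the add.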
      llvm::Value* start_index0 = ir_builder->CreateSExtOrBitCast(
          start_index[i], update_index[i]->getType());
      output_index[i] = ir_builder->CreateAdd(start_index0, update_index[i]);
    }

    // Do output[output_index] = update[update_index].
    TF_ASSIGN_OR_RETURN(llvm::Value * update_data,
                        update_array_generator(update_index));
    output_array.EmitWriteArrayElement(output_index, update_data, ir_builder);
    return Status::OK();
  };

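  // When the caller supplies launch dimensions we are on the GPU path, so
  // parallelize the loop across the given thread/block configuration;
  // otherwise fall back to an ordinary sequential loop nest.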
  if (launch_dimensions != nullptr) {
    return gpu::ParallelLoopEmitter(loop_body_emitter, update_shape,
                                    *launch_dimensions, ir_builder)
        .EmitLoop(name);
  }
  return LoopEmitter(loop_body_emitter, update_shape, ir_builder)
      .EmitLoop(name);
}

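// Example call (hypothetical names, for illustration only): for a
// dynamic-update-slice instruction `dus` whose output buffer aliases the
// buffer of operand 0,
//
//   TF_RETURN_IF_ERROR(EmitDynamicUpdateSliceInPlace(
//       /*operand_arrays=*/{operand_array, update_array, start_indices_array},
//       /*output_array=*/operand_array, IrName(dus), &ir_builder));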
Status EmitDynamicUpdateSliceInPlace(
    tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
    const IrArray& output_array, tensorflow::StringPiece name,
    llvm::IRBuilder<>* ir_builder) {
  VLOG(2) << "EmitDynamicUpdateSliceInPlace for " << name;

  // No need to use operand_arrays[0], the input array of the
  // dynamic-update-slice, because we know it aliases the op's output.
  IrArray update_array = operand_arrays[1];
  IrArray start_indices_array = operand_arrays[2];
  Shape output_shape = output_array.GetShape();
  Shape update_shape = update_array.GetShape();

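  // Wrap the update and start-indices arrays in element generators so the
  // shared implementation can consume them the same way it consumes the lazy
  // generators produced for fused operands.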
  ElementGenerator start_indices_generator = [&](const IrArray::Index& index) {
    return start_indices_array.EmitReadArrayElement(index, ir_builder);
  };
  ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
    return update_array.EmitReadArrayElement(index, ir_builder);
  };

  return EmitDynamicUpdateSliceInPlaceImpl(
      update_shape, start_indices_generator, update_array_generator,
      output_array, /*launch_dimensions=*/nullptr, name, ir_builder);
}

// Shared implementation for EmitFusedDynamicUpdateSliceInPlace and
// EmitParallelFusedDynamicUpdateSliceInPlace.
//
// Emits a sequential loop if launch_dimensions is null.
static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
    HloInstruction* fusion,
    tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
    const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
    const gpu::LaunchDimensions* launch_dimensions,
    llvm::IRBuilder<>* ir_builder) {
  CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
  VLOG(2) << "EmitFusedDynamicUpdateSliceInPlace for "
          << fusion->ToShortString();

  auto* dynamic_update_slice = fusion->fused_expression_root();

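  // The fused root is expected to be a dynamic-update-slice; its operands are
  // the array being updated (operand 0), the update (operand 1), and the
  // start indices (operand 2).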
  const auto* update = dynamic_update_slice->operand(1);
  const auto* start_indices = dynamic_update_slice->operand(2);
  Shape update_shape = update->shape();

  // Our in-place dynamic-update-slice implementation emits a loop over
  // update_shape.  To emit a cache-friendly loop, we need to know that shape's
  // layout.
  //
  // update_shape is inside a fusion node -- it's never materialized in memory
  // and thus doesn't have a layout.  In this case we use the layout of the
  // fusion node for iteration, since that corresponds to the order in memory of
  // the buffer we'll be writing to.
  //
  // (This isn't necessarily optimal; in some cases it might be faster to peek
  // through the chain of ops that gives us the update operand and use the
  // layout of its source buffer(s).  But this is no worse than we do with
  // fusion elsewhere.)
  TF_RETURN_IF_ERROR(
      LayoutUtil::CopyLayoutBetweenShapes(fusion->shape(), &update_shape));

  // Create element generators for update and start_indices.
  FusedIrEmitter fused_emitter(fusion_operand_arrays, elemental_emitter);
  TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
  ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
  ElementGenerator start_indices_generator =
      fused_emitter.GetGenerator(start_indices);
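  // These generators compute the update and start-index elements on the fly
  // at each requested index; nothing inside the fusion is materialized in a
  // buffer.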

  return EmitDynamicUpdateSliceInPlaceImpl(
      update_shape, start_indices_generator, update_array_generator,
      fusion_output_array, launch_dimensions, IrName(fusion), ir_builder);
}

Status EmitFusedDynamicUpdateSliceInPlace(
    HloInstruction* fusion,
    tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
    const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
    llvm::IRBuilder<>* ir_builder) {
  return EmitFusedDynamicUpdateSliceInPlaceImpl(
      fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
      /*launch_dimensions=*/nullptr, ir_builder);
}

Status EmitParallelFusedDynamicUpdateSliceInPlace(
    HloInstruction* fusion,
    tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
    const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
    const gpu::LaunchDimensions& launch_dimensions,
    llvm::IRBuilder<>* ir_builder) {
  return EmitFusedDynamicUpdateSliceInPlaceImpl(
      fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
      &launch_dimensions, ir_builder);
}

}  // namespace llvm_ir
}  // namespace xla