author    Benjamin Kramer <kramerb@google.com>  2018-06-26 07:31:50 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>  2018-06-26 07:34:46 -0700
commit    5f2c44f6bb5507b84c8a11e9f614c2770f620032 (patch)
tree      ad4b28bb79ebd3fee39f019fe67ad91e50c014e1 /tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
parent    c5feedabcebe67ea6c72402832ef9fd25c560446 (diff)
[XLA:GPU] Make the input-fused reduce emitter work on 16-bit types
There's a bunch of things going on here:
- BuildInitializerThunk threw away half of 16 bit init values. Fix that.
- Make HandleFusion verify that it gets input-fusible reduces
- Fuse BF16 again in multi-output fusion. This was a workaround for the initializer bug
- Drop the 32 bit requirement from unfused reduce emission. It is really confusing to have different code paths for fused and unfused reduces
- Emit 8/16 bit integer add/min/max as CAS.

This is somewhat covered by existing tests.

PiperOrigin-RevId: 202125572
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc')
-rw-r--r-- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc 10
1 file changed, 5 insertions, 5 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index d7966e2e84..fbd647f251 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -615,6 +615,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
output_shape_index = {i};
}
if (inst->opcode() == HloOpcode::kReduce) {
+ CHECK(IsReductionToVector(*inst))
+ << "Only reductions to vector are supported";
// Shapes, layouts and dimensions must be the same for all reduces
// inside of this fusion.
CHECK(ShapeUtil::Equal(first_reduce->shape(), inst->shape()));
@@ -1970,10 +1972,8 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
HloComputation* reducer = reduce->to_apply();
// HandleReduce specializes reduction from a multi-dimensional array to a 1D
// array. The specialized version requires an initializer thunk that
- // ingitializes the output array to the initial value of the reduce.
- if (IsReductionToVector(*reduce) &&
- // NVPTX backend can't do atomic cmpxchg any narrower than 32 bits
- 32 <= primitive_util::BitWidth(reduce->shape().element_type())) {
+ // initializes the output array to the initial value of the reduce.
+ if (IsReductionToVector(*reduce)) {
TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
BuildInitializerThunk(reduce));
std::vector<std::unique_ptr<Thunk>> thunks;
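The condition removed in this hunk rejected element types narrower than 32 bits because NVPTX atomic cmpxchg only operates on 32-bit (or wider) words. With this change, sub-32-bit add/min/max reductions are instead lowered through a 32-bit compare-and-swap loop, as the commit message notes. The following standalone C++ sketch shows the idea for a 16-bit atomic add; it is illustrative only (the emitter generates LLVM IR, not this code), and the function name and lane parameter are made up for the example.

#include <atomic>
#include <cstdint>

// Sketch: atomically add `value` to the 16-bit lane (0 = low half,
// 1 = high half) of a 32-bit word, using only a 32-bit CAS.
uint16_t AtomicAdd16ViaCas32(std::atomic<uint32_t>& word, int lane,
                             uint16_t value) {
  const unsigned shift = lane ? 16u : 0u;
  const uint32_t mask = uint32_t{0xFFFF} << shift;
  uint32_t old_word = word.load(std::memory_order_relaxed);
  while (true) {
    const uint16_t old_val =
        static_cast<uint16_t>((old_word >> shift) & 0xFFFFu);
    const uint16_t new_val = static_cast<uint16_t>(old_val + value);
    const uint32_t new_word =
        (old_word & ~mask) | (uint32_t{new_val} << shift);
    // compare_exchange_weak refreshes old_word on failure, so the loop
    // simply retries with the current contents of the word.
    if (word.compare_exchange_weak(old_word, new_word)) return old_val;
  }
}

The same read-modify-CAS pattern works for min and max by swapping the add for the corresponding comparison.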
@@ -2715,7 +2715,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
uint8 b = literal_bytes.front();
pattern16 = uint16{b} | (uint16{b} << 8);
} else {
- pattern16 = literal_bytes.front();
+ memcpy(&pattern16, literal_bytes.data(), sizeof(pattern16));
}
uint32 pattern32 = uint32{pattern16} | (uint32{pattern16} << 16);
return {MakeUnique<Memset32BitValueThunk>(
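The hunk above is the initializer fix called out in the commit message: the old code assigned only literal_bytes.front() to pattern16, silently dropping the high byte of 16-bit init values, whereas memcpy copies both bytes before the pattern is broadcast into the 32-bit word written by Memset32BitValueThunk. A small standalone sketch of that broadcast (illustrative only, not the XLA function itself; the helper name is made up):

#include <cassert>
#include <cstdint>
#include <cstring>

// Sketch: build the 32-bit fill word for a memset-based initializer from a
// 1- or 2-byte literal. Assumes byte_width is 1 or 2; wider element types
// take a different path in the real emitter.
uint32_t BroadcastInitPattern(const uint8_t* literal_bytes, int byte_width) {
  uint16_t pattern16 = 0;
  if (byte_width == 1) {
    const uint8_t b = literal_bytes[0];
    pattern16 = static_cast<uint16_t>(uint16_t{b} | (uint16_t{b} << 8));
  } else {
    assert(byte_width == 2);
    // Copy both bytes; the old `pattern16 = literal_bytes.front()` kept
    // only the first one.
    std::memcpy(&pattern16, literal_bytes, sizeof(pattern16));
  }
  // Replicate the 16-bit pattern into a full 32-bit memset word.
  return uint32_t{pattern16} | (uint32_t{pattern16} << 16);
}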