diff options
author | Benjamin Kramer <kramerb@google.com> | 2018-06-26 07:31:50 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-26 07:34:46 -0700 |
commit | 5f2c44f6bb5507b84c8a11e9f614c2770f620032 (patch) | |
tree | ad4b28bb79ebd3fee39f019fe67ad91e50c014e1 /tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | |
parent | c5feedabcebe67ea6c72402832ef9fd25c560446 (diff) |
[XLA:GPU] Make the input-fused reduce emitter work on 16-bit types
There's a bunch of things going on here:
- BuildInitializerThunk threw away half of 16 bit init values. Fix that.
- Make HandleFusion verify that it gets input-fusible reduces
- Fuse BF16 again in multi-output fusion. This was a workaround for the initializer bug
- Drop the 32 bit requirement from unfused reduce emission. It is really confusing to have different code paths for fused and unfused reduces
- Emit 8/16 bit integer add/min/max as CAS.
This is somewhat covered by existing tests.
PiperOrigin-RevId: 202125572
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 10 |
1 files changed, 5 insertions, 5 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index d7966e2e84..fbd647f251 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -615,6 +615,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { output_shape_index = {i}; } if (inst->opcode() == HloOpcode::kReduce) { + CHECK(IsReductionToVector(*inst)) + << "Only reductions to vector are supported"; // Shapes, layouts and dimensions must be the same for all reduces // inside of this fusion. CHECK(ShapeUtil::Equal(first_reduce->shape(), inst->shape())); @@ -1970,10 +1972,8 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) { HloComputation* reducer = reduce->to_apply(); // HandleReduce specializes reduction from a multi-dimensional array to a 1D // array. The specialized version requires an initializer thunk that - // ingitializes the output array to the initial value of the reduce. - if (IsReductionToVector(*reduce) && - // NVPTX backend can't do atomic cmpxchg any narrower than 32 bits - 32 <= primitive_util::BitWidth(reduce->shape().element_type())) { + // initializes the output array to the initial value of the reduce. + if (IsReductionToVector(*reduce)) { TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk, BuildInitializerThunk(reduce)); std::vector<std::unique_ptr<Thunk>> thunks; @@ -2715,7 +2715,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk( uint8 b = literal_bytes.front(); pattern16 = uint16{b} | (uint16{b} << 8); } else { - pattern16 = literal_bytes.front(); + memcpy(&pattern16, literal_bytes.data(), sizeof(pattern16)); } uint32 pattern32 = uint32{pattern16} | (uint32{pattern16} << 16); return {MakeUnique<Memset32BitValueThunk>( |