diff options
author | Benjamin Kramer <kramerb@google.com> | 2018-06-26 07:31:50 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-26 07:34:46 -0700 |
commit | 5f2c44f6bb5507b84c8a11e9f614c2770f620032 (patch) | |
tree | ad4b28bb79ebd3fee39f019fe67ad91e50c014e1 /tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | |
parent | c5feedabcebe67ea6c72402832ef9fd25c560446 (diff) |
[XLA:GPU] Make the input-fused reduce emitter work on 16-bit types
There's a bunch of things going on here:
- BuildInitializerThunk threw away half of 16 bit init values. Fix that.
- Make HandleFusion verify that it gets input-fusible reduces
- Fuse BF16 again in multi-output fusion. This was a workaround for the initializer bug
- Drop the 32 bit requirement from unfused reduce emission. It is really confusing to have different code paths for fused and unfused reduces
- Emit 8/16 bit integer add/min/max as CAS.
This is somewhat covered by existing tests.
PiperOrigin-RevId: 202125572
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 10 |
1 files changed, 5 insertions, 5 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index d7966e2e84..fbd647f251 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -615,6 +615,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { output_shape_index = {i}; } if (inst->opcode() == HloOpcode::kReduce) { + CHECK(IsReductionToVector(*inst)) + << "Only reductions to vector are supported"; // Shapes, layouts and dimensions must be the same for all reduces // inside of this fusion. CHECK(ShapeUtil::Equal(first_reduce->shape(), inst->shape())); @@ -1970,10 +1972,8 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) { HloComputation* reducer = reduce->to_apply(); // HandleReduce specializes reduction from a multi-dimensional array to a 1D // array. The specialized version requires an initializer thunk that - // ingitializes the output array to the initial value of the reduce. - if (IsReductionToVector(*reduce) && - // NVPTX backend can't do atomic cmpxchg any narrower than 32 bits - 32 <= primitive_util::BitWidth(reduce->shape().element_type())) { + // initializes the output array to the initial value of the reduce. + if (IsReductionToVector(*reduce)) { TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk, BuildInitializerThunk(reduce)); std::vector<std::unique_ptr<Thunk>> thunks; @@ -2715,7 +2715,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk( uint8 b = literal_bytes.front(); pattern16 = uint16{b} | (uint16{b} << 8); } else { - pattern16 = literal_bytes.front(); + memcpy(&pattern16, literal_bytes.data(), sizeof(pattern16)); } uint32 pattern32 = uint32{pattern16} | (uint32{pattern16} << 16); return {MakeUnique<Memset32BitValueThunk>( |