Parallelize BitonicSort on GPU.

We now emit O(log^n) kernel thunks. Each thunk is responsible for looping over the other dimensions, and then doing a comparison loop through the dimension that should be sorted. PiperOrigin-RevId: 205791397
author: Adrian Kuegel <akuegel@google.com> 2018-07-24 01:13:09 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-07-24 01:16:32 -0700
commit: 33035bb79b6ecb408ef83cee5fc3e52ce058f39f (patch)
tree: 65dcfcad4f6461acbe91d98e71cfea06285b9890 /tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
parent: fca1561b9d5932f940cf89e03128cf197547bed2 (diff)
1 files changed, 45 insertions, 3 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 64a6baf66d..b1038a3cc9 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -63,6 +63,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -71,6 +72,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
@@ -2036,11 +2038,51 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
         /*mem_size=*/ShapeUtil::ByteSizeOf(sort->shape()), sort));
   }
 
-  thunks.push_back(
-      BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
+  int64 dimension_to_sort = sort->dimensions(0);
+  int64 dimension_to_sort_bound = sort->shape().dimensions(dimension_to_sort);
+  int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
+  auto index_type = b_.getInt64Ty();
+
+  // Naive C++ code for the outer loops:
+  //
+  // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
+  //     ++stage) {
+  //   int64 first_xor_mask = (1LL << (stage + 1)) - 1;
+  //   SortInPlace(first_xor_mask);
+  //   for (int64 mask = stage - 1; mask >= 0; --mask) {
+  //     int64 later_xor_mask = 1LL << mask;
+  //     SortInPlace(later_xor_mask);
+  //   }
+  // }
+  //
+  // This follows the algorithm described on Wikipedia:
+  // https://en.wikipedia.org/wiki/Bitonic_sorter
+
+  for (int64 stage = 0; stage < num_stages; ++stage) {
+    for (int64 mask = stage; mask >= 0; --mask) {
+      thunks.push_back(
+          BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
+      LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
+          sort->shape(), ir_emitter_context_->device_description());
+      UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
+                             ir_emitter_context_->llvm_module());
+
+      llvm::Value* xor_mask;
+      if (mask == stage) {
+        xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1);
+      } else {
+        xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask);
+      }
+
+      TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
+          dimension_to_sort, GetIrArray(*sort, *sort), IrName(sort), xor_mask,
+          &b_, &launch_dimensions));
+    }
+  }
+
   thunk_sequence_->emplace_back(
       MakeUnique<SequentialThunk>(std::move(thunks), sort));
-  return IrEmitter::HandleSort(sort);
+  return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) {
author	Adrian Kuegel <akuegel@google.com>	2018-07-24 01:13:09 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-07-24 01:16:32 -0700
commit	33035bb79b6ecb408ef83cee5fc3e52ce058f39f (patch)
tree	65dcfcad4f6461acbe91d98e71cfea06285b9890 /tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
parent	fca1561b9d5932f940cf89e03128cf197547bed2 (diff)