aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
diff options
context:
space:
mode:
authorGravatar Adrian Kuegel <akuegel@google.com>2018-07-24 01:13:09 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-07-24 01:16:32 -0700
commit33035bb79b6ecb408ef83cee5fc3e52ce058f39f (patch)
tree65dcfcad4f6461acbe91d98e71cfea06285b9890 /tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
parentfca1561b9d5932f940cf89e03128cf197547bed2 (diff)
Parallelize BitonicSort on GPU.
We now emit O(log^n) kernel thunks. Each thunk is responsible for looping over the other dimensions, and then doing a comparison loop through the dimension that should be sorted. PiperOrigin-RevId: 205791397
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc')
-rw-r--r--tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc48
1 files changed, 45 insertions, 3 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 64a6baf66d..b1038a3cc9 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -63,6 +63,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
#include "tensorflow/compiler/xla/shape_util.h"
@@ -71,6 +72,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/window_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
@@ -2036,11 +2038,51 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
/*mem_size=*/ShapeUtil::ByteSizeOf(sort->shape()), sort));
}
- thunks.push_back(
- BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
+ int64 dimension_to_sort = sort->dimensions(0);
+ int64 dimension_to_sort_bound = sort->shape().dimensions(dimension_to_sort);
+ int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
+ auto index_type = b_.getInt64Ty();
+
+ // Naive C++ code for the outer loops:
+ //
+ // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
+ // ++stage) {
+ // int64 first_xor_mask = (1LL << (stage + 1)) - 1;
+ // SortInPlace(first_xor_mask);
+ // for (int64 mask = stage - 1; mask >= 0; --mask) {
+ // int64 later_xor_mask = 1LL << mask;
+ // SortInPlace(later_xor_mask);
+ // }
+ // }
+ //
+ // This follows the algorithm described on Wikipedia:
+ // https://en.wikipedia.org/wiki/Bitonic_sorter
+
+ for (int64 stage = 0; stage < num_stages; ++stage) {
+ for (int64 mask = stage; mask >= 0; --mask) {
+ thunks.push_back(
+ BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
+ LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
+ sort->shape(), ir_emitter_context_->device_description());
+ UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
+ ir_emitter_context_->llvm_module());
+
+ llvm::Value* xor_mask;
+ if (mask == stage) {
+ xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1);
+ } else {
+ xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask);
+ }
+
+ TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
+ dimension_to_sort, GetIrArray(*sort, *sort), IrName(sort), xor_mask,
+ &b_, &launch_dimensions));
+ }
+ }
+
thunk_sequence_->emplace_back(
MakeUnique<SequentialThunk>(std::move(thunks), sort));
- return IrEmitter::HandleSort(sort);
+ return Status::OK();
}
Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) {