diff options
author | Adrian Kuegel <akuegel@google.com> | 2018-07-24 01:13:09 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-07-24 01:16:32 -0700 |
commit | 33035bb79b6ecb408ef83cee5fc3e52ce058f39f (patch) | |
tree | 65dcfcad4f6461acbe91d98e71cfea06285b9890 /tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | |
parent | fca1561b9d5932f940cf89e03128cf197547bed2 (diff) |
Parallelize BitonicSort on GPU.
We now emit O(log^2 n) kernel thunks (one per bitonic-sort pass: each of the
Log2Ceiling(n) stages performs stage+1 passes). Each thunk is responsible for
looping over the other dimensions, and then doing a comparison loop through the
dimension that should be sorted.
PiperOrigin-RevId: 205791397
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 48 |
1 files changed, 45 insertions, 3 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 64a6baf66d..b1038a3cc9 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -63,6 +63,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -71,6 +72,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" @@ -2036,11 +2038,51 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { /*mem_size=*/ShapeUtil::ByteSizeOf(sort->shape()), sort)); } - thunks.push_back( - BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); + int64 dimension_to_sort = sort->dimensions(0); + int64 dimension_to_sort_bound = sort->shape().dimensions(dimension_to_sort); + int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); + auto index_type = b_.getInt64Ty(); + + // Naive C++ code for the outer loops: + // + // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound); + // ++stage) { + // int64 first_xor_mask = (1LL << (stage + 1)) - 1; + // SortInPlace(first_xor_mask); + // for (int64 mask = stage - 1; mask >= 0; --mask) { + // int64 later_xor_mask = 1LL << mask; + // SortInPlace(later_xor_mask); + // } + // 
} + // + // This follows the algorithm described on Wikipedia: + // https://en.wikipedia.org/wiki/Bitonic_sorter + + for (int64 stage = 0; stage < num_stages; ++stage) { + for (int64 mask = stage; mask >= 0; --mask) { + thunks.push_back( + BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); + LaunchDimensions launch_dimensions = CalculateLaunchDimensions( + sort->shape(), ir_emitter_context_->device_description()); + UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), + ir_emitter_context_->llvm_module()); + + llvm::Value* xor_mask; + if (mask == stage) { + xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1); + } else { + xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask); + } + + TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace( + dimension_to_sort, GetIrArray(*sort, *sort), IrName(sort), xor_mask, + &b_, &launch_dimensions)); + } + } + thunk_sequence_->emplace_back( MakeUnique<SequentialThunk>(std::move(thunks), sort)); - return IrEmitter::HandleSort(sort); + return Status::OK(); } Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) { |