aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Adrian Kuegel <akuegel@google.com>2018-07-31 05:37:41 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-07-31 05:42:10 -0700
commit2826d123a017bc5f1a2cc7b969e275c5a63c326c (patch)
tree8204b3e1a0103a7a183a28e1976fc68491a8634b
parent3bec2640dcbd251f4eb2517d9ae7d8909886375d (diff)
Allow Sort to share the buffer with the operand if it is the only user.
The BitonicSort algorithm works in-place, so we can make use of that. On GPU, so far we copied the operand to the output and then performed the algorithm in-place. Now, we may not need to do this anymore if we see that the buffer is shared. Also, we now only need device-to-device copies in case the buffer is not shared, because constants are now also assigned a buffer. PiperOrigin-RevId: 206745686
-rw-r--r--tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc43
-rw-r--r--tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc15
-rw-r--r--tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc42
-rw-r--r--tensorflow/compiler/xla/service/tuple_points_to_analysis.cc16
-rw-r--r--tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc42
5 files changed, 129 insertions, 29 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 3a5394dac6..874c7cfb8a 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2068,6 +2068,7 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
std::vector<std::unique_ptr<Thunk>> thunks;
+ auto keys = sort->operand(0);
auto values = sort->operand_count() > 1 ? sort->operand(1) : nullptr;
ShapeIndex keys_shape_index({});
ShapeIndex values_shape_index({});
@@ -2076,41 +2077,25 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
values_shape_index = ShapeIndex({1});
}
auto keys_destination = GetAllocationSlice(*sort, keys_shape_index);
+ auto values_destination = GetAllocationSlice(*sort, values_shape_index);
- // First copy the operand(s) to the output, so that we can sort in-place.
- // TODO(b/26783907): Share buffer of output and operand when it is possible.
- if (sort->operand(0)->IsConstant()) {
- thunks.push_back(MakeUnique<HostToDeviceCopyThunk>(
- /*source_address=*/sort->operand(0)->literal().untyped_data(),
- /*destination_buffer=*/keys_destination,
- /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(0)->shape()),
- nullptr));
- } else {
+ if (keys_destination != GetAllocationSlice(*keys)) {
thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
- /*source_address=*/GetAllocationSlice(*sort->operand(0)),
+ /*source_address=*/GetAllocationSlice(*keys),
/*destination_buffer=*/keys_destination,
- /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(0)->shape()),
- nullptr));
+ /*mem_size=*/ShapeUtil::ByteSizeOf(keys->shape()), nullptr));
}
- if (values != nullptr) {
- if (values->IsConstant()) {
- thunks.push_back(MakeUnique<HostToDeviceCopyThunk>(
- /*source_address=*/sort->operand(1)->literal().untyped_data(),
- /*destination_buffer=*/GetAllocationSlice(*sort, values_shape_index),
- /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(1)->shape()),
- nullptr));
- } else {
- thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
- /*source_address=*/GetAllocationSlice(*sort->operand(1)),
- /*destination_buffer=*/GetAllocationSlice(*sort, values_shape_index),
- /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(1)->shape()),
- nullptr));
- }
+ if (values != nullptr && values_destination != GetAllocationSlice(*values)) {
+ // TODO(b/26783907): Figure out why we never seem to share buffers for
+ // key/value sort.
+ thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+ /*source_address=*/GetAllocationSlice(*values),
+ /*destination_buffer=*/values_destination,
+ /*mem_size=*/ShapeUtil::ByteSizeOf(values->shape()), nullptr));
}
int64 dimension_to_sort = sort->dimensions(0);
- int64 dimension_to_sort_bound =
- sort->operand(0)->shape().dimensions(dimension_to_sort);
+ int64 dimension_to_sort_bound = keys->shape().dimensions(dimension_to_sort);
int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
auto index_type = b_.getInt64Ty();
@@ -2134,7 +2119,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
thunks.push_back(
BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
- sort->operand(0)->shape(), ir_emitter_context_->device_description());
+ keys->shape(), ir_emitter_context_->device_description());
UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
ir_emitter_context_->llvm_module());
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 1abfcb7703..bbfb0c253f 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -1084,6 +1084,21 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
std::vector<int64> operand_indices = user->OperandIndices(operand);
return operand_indices.size() == 1 && operand_indices[0] == 0;
}
+ if (user->opcode() == HloOpcode::kSort) {
+ // Only valid if there are no other users.
+ if (operand->users().size() != 1) {
+ return false;
+ }
+ // If we only sort keys, the output of sort is not a tuple, so we can always
+ // share the buffer.
+ if (user->operand_count() == 1) {
+ return true;
+ }
+ CHECK(!user_index.empty());
+ // Only share with the right tuple element buffer.
+ std::vector<int64> operand_indices = user->OperandIndices(operand);
+ return operand_indices.size() == 1 && user_index[0] == operand_indices[0];
+ }
if (user->opcode() == HloOpcode::kCall) {
// Get all uses of value defined by 'operand' at 'operand_index'.
const auto& uses = GetValueDefinedAt(operand, operand_index).uses();
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 37bc2d2c9d..2ec31a9148 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -2232,6 +2232,48 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
dataflow_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
}
+TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
+ auto builder = HloComputation::Builder(TestName());
+
+ Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
+ auto keys = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, keys_shape, "keys"));
+ auto sort =
+ builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
+
+ BuildModuleAndRunAnalysis(builder.Build());
+
+ EXPECT_TRUE(
+ dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
+ auto builder = HloComputation::Builder(TestName());
+
+ Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
+ Shape values_shape = ShapeUtil::MakeShape(F32, {8});
+ auto keys = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, keys_shape, "keys"));
+ auto values = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, values_shape, "values"));
+ auto sort = builder.AddInstruction(HloInstruction::CreateSort(
+ ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys, values));
+
+ BuildModuleAndRunAnalysis(builder.Build());
+
+ // The buffer for the keys can be shared with the first tuple entry.
+ EXPECT_TRUE(
+ dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {0}));
+ // The buffer for the values can be shared with the second tuple entry.
+ EXPECT_TRUE(
+ dataflow_analysis_->CanShareOperandBufferWithUser(values, {}, sort, {1}));
+ // Verify that the buffers are not shared with the "wrong" tuple entry.
+ EXPECT_FALSE(
+ dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {1}));
+ EXPECT_FALSE(
+ dataflow_analysis_->CanShareOperandBufferWithUser(values, {}, sort, {0}));
+}
+
TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
auto builder = HloComputation::Builder(TestName());
Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 990dfc410c..0effdc80a4 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -718,6 +718,7 @@ bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
// root at operand 0 or 1. Or...
// (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index
// 0.
+// (5) The 'user' of 'operand' is Sort, and it is the only user.
//
// (2) and (3) can only be determined if points-to analysis is available.
bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
@@ -783,6 +784,21 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
std::vector<int64> operand_indices = user->OperandIndices(operand);
return operand_indices.size() == 1 && operand_indices[0] == 0;
}
+ if (user->opcode() == HloOpcode::kSort) {
+ // Only valid if there are no other users.
+ if (operand->users().size() != 1) {
+ return false;
+ }
+ // If we only sort keys, the output of sort is not a tuple, so we can always
+ // share the buffer.
+ if (user->operand_count() == 1) {
+ return true;
+ }
+ CHECK(!user_index.empty());
+ // Only share with the right tuple element buffer.
+ std::vector<int64> operand_indices = user->OperandIndices(operand);
+ return operand_indices.size() == 1 && user_index[0] == operand_indices[0];
+ }
if (user->opcode() == HloOpcode::kCall) {
// TODO(b/62548313): Remove when buffer assignment is module scoped and
// does not assign buffers to calls.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index 0ac8df4271..2e5f646804 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -1012,6 +1012,48 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
points_to_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
}
+TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
+ auto builder = HloComputation::Builder(TestName());
+
+ Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
+ auto keys = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, keys_shape, "keys"));
+ auto sort =
+ builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
+
+ BuildModuleAndRunAnalysis(builder.Build());
+
+ EXPECT_TRUE(
+ points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
+ auto builder = HloComputation::Builder(TestName());
+
+ Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
+ Shape values_shape = ShapeUtil::MakeShape(F32, {8});
+ auto keys = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, keys_shape, "keys"));
+ auto values = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, values_shape, "values"));
+ auto sort = builder.AddInstruction(HloInstruction::CreateSort(
+ ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys, values));
+
+ BuildModuleAndRunAnalysis(builder.Build());
+
+ // The buffer for the keys can be shared with the first tuple entry.
+ EXPECT_TRUE(
+ points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {0}));
+ // The buffer for the values can be shared with the second tuple entry.
+ EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(values, {},
+ sort, {1}));
+ // Verify that the buffers are not shared with the "wrong" tuple entry.
+ EXPECT_FALSE(
+ points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {1}));
+ EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(values, {},
+ sort, {0}));
+}
+
TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
auto builder = HloComputation::Builder(TestName());
Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});