diff options
Diffstat (limited to 'tensorflow/compiler/xla/service/llvm_ir')
-rw-r--r-- | tensorflow/compiler/xla/service/llvm_ir/BUILD | 14 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc | 30 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h | 11 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/llvm_ir/sort_util.cc | 201 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/llvm_ir/sort_util.h | 34 |
5 files changed, 290 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index c14a5bfb53..462be543bc 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -181,6 +181,20 @@ cc_library( ) cc_library( + name = "sort_util", + srcs = ["sort_util.cc"], + hdrs = ["sort_util.h"], + deps = [ + ":ir_array", + ":llvm_loop", + ":llvm_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/core:lib", + "@llvm//:core", + ], +) + +cc_library( name = "tuple_ops", srcs = ["tuple_ops.cc"], hdrs = ["tuple_ops.h"], diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc index c9ae7d3afd..1227534779 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc @@ -262,5 +262,35 @@ IrArray::Index ForLoopNest::AddLoopsForShapeOnDimensions( return index; } +IrArray::Index ForLoopNest::EmitOperandArrayLoopNest( + const llvm_ir::IrArray& operand_array, int64 dimension_to_skip, + tensorflow::StringPiece name_suffix) { + // Prepares the dimension list we will use to emit the loop nest. Outermost + // loops are added first. Add loops in major-to-minor order, and skip the + // 'dimension_to_skip' dimension. + std::vector<int64> dimensions; + const Shape& shape = operand_array.GetShape(); + for (int64 dimension : LayoutUtil::MinorToMajor(shape)) { + if (dimension != dimension_to_skip) { + dimensions.push_back(dimension); + } + } + + // Create loop nest with one for-loop for each dimension of the + // output. + llvm_ir::IrArray::Index index = + AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix); + // Verify every dimension except the 'dimension_to_skip' dimension was set in + // the index. + for (size_t dimension = 0; dimension < index.size(); ++dimension) { + if (dimension == dimension_to_skip) { + DCHECK_EQ(nullptr, index[dimension]); + } else { + DCHECK_NE(nullptr, index[dimension]); + } + } + return index; +} + } // namespace llvm_ir } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h index 0dd5b9d3b2..b3266022db 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h @@ -248,6 +248,17 @@ class ForLoopNest { const Shape& shape, tensorflow::gtl::ArraySlice<int64> dimensions, tensorflow::StringPiece suffix); + // Emits a series of nested loops for iterating over an operand array. Loops + // are constructed in major to minor dimension layout order. No loop is + // emitted for the given 'dimension_to_skip'. The function returns an IrArray + // index for the given operand_array containing the indvars of the loops. All + // dimensions of the index are filled except for 'dimension_to_skip'. + // name_suffix is the string to append to the names of LLVM constructs (eg, + // basic blocks) constructed by this method. + IrArray::Index EmitOperandArrayLoopNest(const llvm_ir::IrArray& operand_array, + int64 dimension_to_skip, + tensorflow::StringPiece name_suffix); + // Convenience methods which return particular basic blocks of the outermost // or innermost loops. These methods return nullptr if no loops have been // added yet. diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc new file mode 100644 index 0000000000..16a9a5aaeb --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc @@ -0,0 +1,201 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h" + +// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace llvm_ir { + +namespace { +// Adds the inner comparison loop where we compare elements pointed to by +// 'keys_index' and 'compare_keys_index'. +void EmitCompareLoop(int64 dimension_to_sort, + const llvm_ir::IrArray::Index& keys_index, + const llvm_ir::IrArray::Index& compare_keys_index, + const llvm_ir::IrArray& keys_array, + llvm::IRBuilder<>* ir_builder) { + // TODO(b/26783907): parallelize this loop. + + // if (is_smaller_index && + // compare_keys[dimension_to_sort] < dimension_to_sort_bound) + llvm::Value* is_smaller_index = ir_builder->CreateICmpSLT( + keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]); + int64 dimension_to_sort_bound = + keys_array.GetShape().dimensions(dimension_to_sort); + auto if_data = llvm_ir::EmitIfThenElse( + ir_builder->CreateAnd( + is_smaller_index, + ir_builder->CreateICmpSLT( + compare_keys_index[dimension_to_sort], + keys_index.GetConstantWithIndexType(dimension_to_sort_bound))), + "smaller_comparison_index", ir_builder, /*emit_else=*/false); + SetToFirstInsertPoint(if_data.true_block, ir_builder); + auto key1 = keys_array.EmitReadArrayElement(keys_index, ir_builder); + auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, ir_builder); + auto key_type = keys_array.GetShape().element_type(); + auto comparison = + primitive_util::IsFloatingPointType(key_type) + // TODO(b/26783907): Figure out how to handle NaNs. + ? ir_builder->CreateFCmp(llvm::FCmpInst::FCMP_ULT, key1, key2) + : ir_builder->CreateICmp( + primitive_util::IsSignedIntegralType(key_type) + ? llvm::ICmpInst::ICMP_SLT + : llvm::ICmpInst::ICMP_ULT, + key1, key2); + auto min_key = ir_builder->CreateSelect(comparison, key1, key2); + auto max_key = ir_builder->CreateSelect(comparison, key2, key1); + keys_array.EmitWriteArrayElement(keys_index, min_key, ir_builder); + keys_array.EmitWriteArrayElement(compare_keys_index, max_key, ir_builder); +} +} // namespace + +Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, + tensorflow::StringPiece name, + llvm::IRBuilder<>* ir_builder) { + const Shape& keys_shape = keys_array.GetShape(); + + // TODO(b/26783907): This case can probably be avoided with the Algebraic + // Simplifier. + if (ShapeUtil::IsScalar(keys_shape)) { + return Status::OK(); + } + + // Create loop nests which loop through the operand dimensions. The sort + // dimension is handled in three separate innermost loops which perform the + // sorting. + ForLoopNest loop_nest(name, ir_builder); + IrArray::Index keys_index = + loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys"); + + // 'compare_keys_index' is the index of the element that 'keys_index' should + // be compared to. + IrArray::Index compare_keys_index(keys_index.GetType()); + for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) { + if (dimension != dimension_to_sort) { + compare_keys_index.push_back(keys_index[dimension]); + } else { + compare_keys_index.push_back(nullptr); + } + } + + // Create the sorting loops which do the sorting. + int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); + std::unique_ptr<ForLoop> stages_loop = loop_nest.AddLoop( + /*start_index=*/0, + /*end_index=*/ + tensorflow::Log2Ceiling64(dimension_to_sort_bound), + /*suffix=*/"sort_stages"); + std::unique_ptr<ForLoop> mask_loop = loop_nest.AddLoop( + /*suffix=*/"mask", + /*start_index=*/keys_index.GetConstantWithIndexType(0), + /*end_index=*/stages_loop->GetIndVarValue()); + std::unique_ptr<ForLoop> compare_loop = loop_nest.AddLoop( + /*start_index=*/0, + /*end_index=*/dimension_to_sort_bound, + /*suffix=*/"compare"); + + // Naive C++ code for the inner loops (without parallelization): + // + // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound); + // ++stage) { + // int64 first_xor_mask = (1LL << (stage + 1)) - 1; + // for (int64 i = 0; i < dimension_to_sort_bound; ++i) { + // int64 j = i ^ first_xor_mask; + // if (i < j && j < dimension_to_sort_bound) { + // int64 min_key = std::min(keys[i], keys[j]); + // keys[j] = std::max(keys[i], keys[j]); + // keys[i] = min_key; + // } + // } + // for (int64 mask = 0; mask < stage; ++mask) { + // int64 later_xor_mask = (1LL << (stage - (mask + 1)); + // for (int64 i = 0; i < dimension_to_sort_bound; ++i) { + // int64 j = i ^ later_xor_mask; + // if (i < j && j < dimension_to_sort_bound) { + // int64 min_key = std::min(keys[i], keys[j]); + // keys[j] = std::max(keys[i], keys[j]); + // keys[i] = min_key; + // } + // } + // } + // } + // + // This follows the algorithm described on Wikipedia: + // https://en.wikipedia.org/wiki/Bitonic_sorter + + SetToFirstInsertPoint(stages_loop->GetBodyBasicBlock(), ir_builder); + // The first xor mask of a stage is 2^(stage + 1) - 1. + auto first_xor_mask = ir_builder->CreateSub( + ir_builder->CreateShl( + keys_index.GetConstantWithIndexType(1), + ir_builder->CreateAdd(stages_loop->GetIndVarValue(), + keys_index.GetConstantWithIndexType(1))), + keys_index.GetConstantWithIndexType(1)); + std::unique_ptr<ForLoop> first_compare_loop = ForLoop::EmitForLoop( + /*prefix=*/"first_compare", + /*start_index=*/keys_index.GetConstantWithIndexType(0), + /*end_index=*/ + keys_index.GetConstantWithIndexType(dimension_to_sort_bound), + /*step=*/keys_index.GetConstantWithIndexType(1), + /*ir_builder=*/ir_builder); + + SetToFirstInsertPoint(first_compare_loop->GetBodyBasicBlock(), ir_builder); + // 'first_compare_loop' iterates through the 'dimension_to_sort'. + keys_index[dimension_to_sort] = first_compare_loop->GetIndVarValue(); + compare_keys_index[dimension_to_sort] = ir_builder->CreateXor( + first_compare_loop->GetIndVarValue(), first_xor_mask); + EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array, + ir_builder); + + SetToFirstInsertPoint(compare_loop->GetPreheaderBasicBlock(), ir_builder); + // The later masks of a stage are 2^(stage - (mask_loop_ind_var + 1)). + auto later_xor_mask = ir_builder->CreateShl( + keys_index.GetConstantWithIndexType(1), + ir_builder->CreateSub( + stages_loop->GetIndVarValue(), + ir_builder->CreateAdd(mask_loop->GetIndVarValue(), + keys_index.GetConstantWithIndexType(1)))); + + SetToFirstInsertPoint(compare_loop->GetBodyBasicBlock(), ir_builder); + // 'compare_loop' iterates through the 'dimension_to_sort'. + keys_index[dimension_to_sort] = compare_loop->GetIndVarValue(); + compare_keys_index[dimension_to_sort] = + ir_builder->CreateXor(compare_loop->GetIndVarValue(), later_xor_mask); + EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array, + ir_builder); + + // Set the IR builder insert point to the exit basic block of the outer most + // loop. This ensures later instructions are inserted after this loop nest. + ir_builder->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); + + return Status::OK(); +} + +} // namespace llvm_ir +} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h new file mode 100644 index 0000000000..fc45bfab12 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h @@ -0,0 +1,34 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_ + +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace llvm_ir { +// Emits llvm IR to sort the 'dimension_to_sort' dimension of 'keys_array' into +// ascending order. +Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, + tensorflow::StringPiece name, + llvm::IRBuilder<>* ir_builder); +} // namespace llvm_ir +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_ |