author    Adrian Kuegel <akuegel@google.com>              2018-07-19 00:58:11 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2018-07-19 01:01:53 -0700
commit    6e94fd7b5eebc506202e1ae3b6c0de73e3b727bf (patch)
tree      5cc9c1124bdd08fc44caf5b1bb28400d20951b4b /tensorflow/compiler/xla/service/llvm_ir
parent    a7c7700110eb27ca534c66699c3bbfa170c36fb2 (diff)
Move Bitonic sort emitter logic into a helper file.
This will allow reusing it for the CPU backend. Also move the method EmitOperandArrayLoopNest to the ForLoopNest class, which makes more sense than having it as part of IrEmitter.

PiperOrigin-RevId: 205200030
Diffstat (limited to 'tensorflow/compiler/xla/service/llvm_ir')
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/BUILD        |  14
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc |  30
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h  |  11
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/sort_util.cc | 201
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/sort_util.h  |  34
5 files changed, 290 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index c14a5bfb53..462be543bc 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -181,6 +181,20 @@ cc_library(
)
cc_library(
+ name = "sort_util",
+ srcs = ["sort_util.cc"],
+ hdrs = ["sort_util.h"],
+ deps = [
+ ":ir_array",
+ ":llvm_loop",
+ ":llvm_util",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/core:lib",
+ "@llvm//:core",
+ ],
+)
+
+cc_library(
name = "tuple_ops",
srcs = ["tuple_ops.cc"],
hdrs = ["tuple_ops.h"],
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index c9ae7d3afd..1227534779 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -262,5 +262,35 @@ IrArray::Index ForLoopNest::AddLoopsForShapeOnDimensions(
return index;
}
+IrArray::Index ForLoopNest::EmitOperandArrayLoopNest(
+    const llvm_ir::IrArray& operand_array, int64 dimension_to_skip,
+    tensorflow::StringPiece name_suffix) {
+  // Prepare the dimension list we will use to emit the loop nest. Outermost
+  // loops are added first. Add loops in major-to-minor order, and skip the
+  // 'dimension_to_skip' dimension.
+  std::vector<int64> dimensions;
+  const Shape& shape = operand_array.GetShape();
+  for (int64 dimension : LayoutUtil::MinorToMajor(shape)) {
+    if (dimension != dimension_to_skip) {
+      dimensions.push_back(dimension);
+    }
+  }
+
+  // Create the loop nest, with one for-loop for each dimension of the
+  // operand.
+  llvm_ir::IrArray::Index index =
+      AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix);
+  // Verify that every dimension except the 'dimension_to_skip' dimension was
+  // set in the index.
+  for (size_t dimension = 0; dimension < index.size(); ++dimension) {
+    if (dimension == dimension_to_skip) {
+      DCHECK_EQ(nullptr, index[dimension]);
+    } else {
+      DCHECK_NE(nullptr, index[dimension]);
+    }
+  }
+  return index;
+}
+
} // namespace llvm_ir
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index 0dd5b9d3b2..b3266022db 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -248,6 +248,17 @@ class ForLoopNest {
const Shape& shape, tensorflow::gtl::ArraySlice<int64> dimensions,
tensorflow::StringPiece suffix);
+ // Emits a series of nested loops for iterating over an operand array. Loops
+ // are constructed in major to minor dimension layout order. No loop is
+ // emitted for the given 'dimension_to_skip'. The function returns an IrArray
+ // index for the given operand_array containing the indvars of the loops. All
+ // dimensions of the index are filled except for 'dimension_to_skip'.
+ // name_suffix is the string to append to the names of LLVM constructs (eg,
+ // basic blocks) constructed by this method.
+ IrArray::Index EmitOperandArrayLoopNest(const llvm_ir::IrArray& operand_array,
+ int64 dimension_to_skip,
+ tensorflow::StringPiece name_suffix);
+
// Convenience methods which return particular basic blocks of the outermost
// or innermost loops. These methods return nullptr if no loops have been
// added yet.
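
For illustration, a minimal usage sketch of the new method (the 'operand_array'
and 'ir_builder' variables, the [2, 3, 4] shape, and the choice of dimension 1
are hypothetical placeholders, not part of this change):

  // Build a loop nest over every dimension of a [2, 3, 4] operand except
  // dimension 1. The returned index is {i0, nullptr, i2}; the caller fills
  // in dimension 1 itself, e.g. with an induction variable of its own loop.
  llvm_ir::ForLoopNest loop_nest("example", ir_builder);
  llvm_ir::IrArray::Index index = loop_nest.EmitOperandArrayLoopNest(
      operand_array, /*dimension_to_skip=*/1, /*name_suffix=*/"example");
  index[1] = my_indvar;  // Hypothetical llvm::Value* for dimension 1.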
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
new file mode 100644
index 0000000000..16a9a5aaeb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -0,0 +1,201 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
+
+// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace llvm_ir {
+
+namespace {
+// Adds the inner comparison loop where we compare elements pointed to by
+// 'keys_index' and 'compare_keys_index'.
+void EmitCompareLoop(int64 dimension_to_sort,
+                     const llvm_ir::IrArray::Index& keys_index,
+                     const llvm_ir::IrArray::Index& compare_keys_index,
+                     const llvm_ir::IrArray& keys_array,
+                     llvm::IRBuilder<>* ir_builder) {
+  // TODO(b/26783907): parallelize this loop.
+
+  // if (is_smaller_index &&
+  //     compare_keys[dimension_to_sort] < dimension_to_sort_bound)
+  llvm::Value* is_smaller_index = ir_builder->CreateICmpSLT(
+      keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]);
+  int64 dimension_to_sort_bound =
+      keys_array.GetShape().dimensions(dimension_to_sort);
+  auto if_data = llvm_ir::EmitIfThenElse(
+      ir_builder->CreateAnd(
+          is_smaller_index,
+          ir_builder->CreateICmpSLT(
+              compare_keys_index[dimension_to_sort],
+              keys_index.GetConstantWithIndexType(dimension_to_sort_bound))),
+      "smaller_comparison_index", ir_builder, /*emit_else=*/false);
+  SetToFirstInsertPoint(if_data.true_block, ir_builder);
+  auto key1 = keys_array.EmitReadArrayElement(keys_index, ir_builder);
+  auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, ir_builder);
+  auto key_type = keys_array.GetShape().element_type();
+  auto comparison =
+      primitive_util::IsFloatingPointType(key_type)
+          // TODO(b/26783907): Figure out how to handle NaNs.
+          ? ir_builder->CreateFCmp(llvm::FCmpInst::FCMP_ULT, key1, key2)
+          : ir_builder->CreateICmp(
+                primitive_util::IsSignedIntegralType(key_type)
+                    ? llvm::ICmpInst::ICMP_SLT
+                    : llvm::ICmpInst::ICMP_ULT,
+                key1, key2);
+  auto min_key = ir_builder->CreateSelect(comparison, key1, key2);
+  auto max_key = ir_builder->CreateSelect(comparison, key2, key1);
+  keys_array.EmitWriteArrayElement(keys_index, min_key, ir_builder);
+  keys_array.EmitWriteArrayElement(compare_keys_index, max_key, ir_builder);
+}
+}  // namespace
+
+Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
+                       tensorflow::StringPiece name,
+                       llvm::IRBuilder<>* ir_builder) {
+  const Shape& keys_shape = keys_array.GetShape();
+
+  // TODO(b/26783907): This case can probably be avoided with the Algebraic
+  // Simplifier.
+  if (ShapeUtil::IsScalar(keys_shape)) {
+    return Status::OK();
+  }
+
+  // Create loop nests which loop through the operand dimensions. The sort
+  // dimension is handled in three separate innermost loops which perform the
+  // sorting.
+  ForLoopNest loop_nest(name, ir_builder);
+  IrArray::Index keys_index =
+      loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys");
+
+  // 'compare_keys_index' is the index of the element that 'keys_index' should
+  // be compared to.
+  IrArray::Index compare_keys_index(keys_index.GetType());
+  for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) {
+    if (dimension != dimension_to_sort) {
+      compare_keys_index.push_back(keys_index[dimension]);
+    } else {
+      compare_keys_index.push_back(nullptr);
+    }
+  }
+
+  // Create the loops which perform the sorting.
+  int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
+  std::unique_ptr<ForLoop> stages_loop = loop_nest.AddLoop(
+      /*start_index=*/0,
+      /*end_index=*/
+      tensorflow::Log2Ceiling64(dimension_to_sort_bound),
+      /*suffix=*/"sort_stages");
+  std::unique_ptr<ForLoop> mask_loop = loop_nest.AddLoop(
+      /*suffix=*/"mask",
+      /*start_index=*/keys_index.GetConstantWithIndexType(0),
+      /*end_index=*/stages_loop->GetIndVarValue());
+  std::unique_ptr<ForLoop> compare_loop = loop_nest.AddLoop(
+      /*start_index=*/0,
+      /*end_index=*/dimension_to_sort_bound,
+      /*suffix=*/"compare");
+
+  // Naive C++ code for the inner loops (without parallelization):
+  //
+  // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
+  //      ++stage) {
+  //   int64 first_xor_mask = (1LL << (stage + 1)) - 1;
+  //   for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
+  //     int64 j = i ^ first_xor_mask;
+  //     if (i < j && j < dimension_to_sort_bound) {
+  //       int64 min_key = std::min(keys[i], keys[j]);
+  //       keys[j] = std::max(keys[i], keys[j]);
+  //       keys[i] = min_key;
+  //     }
+  //   }
+  //   for (int64 mask = 0; mask < stage; ++mask) {
+  //     int64 later_xor_mask = 1LL << (stage - (mask + 1));
+  //     for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
+  //       int64 j = i ^ later_xor_mask;
+  //       if (i < j && j < dimension_to_sort_bound) {
+  //         int64 min_key = std::min(keys[i], keys[j]);
+  //         keys[j] = std::max(keys[i], keys[j]);
+  //         keys[i] = min_key;
+  //       }
+  //     }
+  //   }
+  // }
+  //
+  // This follows the algorithm described on Wikipedia:
+  // https://en.wikipedia.org/wiki/Bitonic_sorter
+
+  SetToFirstInsertPoint(stages_loop->GetBodyBasicBlock(), ir_builder);
+  // The first xor mask of a stage is 2^(stage + 1) - 1.
+  auto first_xor_mask = ir_builder->CreateSub(
+      ir_builder->CreateShl(
+          keys_index.GetConstantWithIndexType(1),
+          ir_builder->CreateAdd(stages_loop->GetIndVarValue(),
+                                keys_index.GetConstantWithIndexType(1))),
+      keys_index.GetConstantWithIndexType(1));
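+  // For example, stage == 1 gives first_xor_mask == 0b11 == 3, pairing
+  // element indices (0, 3), (1, 2), (4, 7), (5, 6), ...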
+ std::unique_ptr<ForLoop> first_compare_loop = ForLoop::EmitForLoop(
+ /*prefix=*/"first_compare",
+ /*start_index=*/keys_index.GetConstantWithIndexType(0),
+ /*end_index=*/
+ keys_index.GetConstantWithIndexType(dimension_to_sort_bound),
+ /*step=*/keys_index.GetConstantWithIndexType(1),
+ /*ir_builder=*/ir_builder);
+
+ SetToFirstInsertPoint(first_compare_loop->GetBodyBasicBlock(), ir_builder);
+ // 'first_compare_loop' iterates through the 'dimension_to_sort'.
+ keys_index[dimension_to_sort] = first_compare_loop->GetIndVarValue();
+ compare_keys_index[dimension_to_sort] = ir_builder->CreateXor(
+ first_compare_loop->GetIndVarValue(), first_xor_mask);
+ EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array,
+ ir_builder);
+
+ SetToFirstInsertPoint(compare_loop->GetPreheaderBasicBlock(), ir_builder);
+ // The later masks of a stage are 2^(stage - (mask_loop_ind_var + 1)).
+ auto later_xor_mask = ir_builder->CreateShl(
+ keys_index.GetConstantWithIndexType(1),
+ ir_builder->CreateSub(
+ stages_loop->GetIndVarValue(),
+ ir_builder->CreateAdd(mask_loop->GetIndVarValue(),
+ keys_index.GetConstantWithIndexType(1))));
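+  // For example, stage == 1 and mask == 0 give later_xor_mask == 1, pairing
+  // element indices (0, 1), (2, 3), (4, 5), ...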
+
+  SetToFirstInsertPoint(compare_loop->GetBodyBasicBlock(), ir_builder);
+  // 'compare_loop' iterates through the 'dimension_to_sort'.
+  keys_index[dimension_to_sort] = compare_loop->GetIndVarValue();
+  compare_keys_index[dimension_to_sort] =
+      ir_builder->CreateXor(compare_loop->GetIndVarValue(), later_xor_mask);
+  EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array,
+                  ir_builder);
+
+  // Set the IR builder insert point to the exit basic block of the outermost
+  // loop. This ensures later instructions are inserted after this loop nest.
+  ir_builder->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
+
+  return Status::OK();
+}
+
+} // namespace llvm_ir
+} // namespace xla
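
For reference, the naive pseudocode above can also be written as a
self-contained, runnable C++ function (a sketch of the sorting network only,
using std::vector and int64_t in place of the emitted IR; NaiveBitonicSort is
a hypothetical name, not part of this change):

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  // In-place sort of 'keys' using the same bitonic-style network as the
  // pseudocode in sort_util.cc. Out-of-range partners are skipped, which is
  // equivalent to padding the array with +infinity up to a power of two.
  void NaiveBitonicSort(std::vector<int64_t>* keys) {
    const int64_t bound = static_cast<int64_t>(keys->size());
    int64_t num_stages = 0;  // num_stages == Log2Ceiling(bound).
    while ((int64_t{1} << num_stages) < bound) ++num_stages;
    // Compare-exchange: move the smaller key to the smaller index.
    auto compare_exchange = [&](int64_t i, int64_t j) {
      if (i < j && j < bound && (*keys)[j] < (*keys)[i]) {
        std::swap((*keys)[i], (*keys)[j]);
      }
    };
    for (int64_t stage = 0; stage < num_stages; ++stage) {
      const int64_t first_xor_mask = (int64_t{1} << (stage + 1)) - 1;
      for (int64_t i = 0; i < bound; ++i) {
        compare_exchange(i, i ^ first_xor_mask);
      }
      for (int64_t mask = 0; mask < stage; ++mask) {
        const int64_t later_xor_mask = int64_t{1} << (stage - (mask + 1));
        for (int64_t i = 0; i < bound; ++i) {
          compare_exchange(i, i ^ later_xor_mask);
        }
      }
    }
  }

For example, sorting {3, 1, 2} runs two stages and yields {1, 2, 3}.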
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
new file mode 100644
index 0000000000..fc45bfab12
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace llvm_ir {
+// Emits LLVM IR to sort the 'dimension_to_sort' dimension of 'keys_array'
+// into ascending order.
+Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
+                       tensorflow::StringPiece name,
+                       llvm::IRBuilder<>* ir_builder);
+} // namespace llvm_ir
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
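
A minimal sketch of a possible call site (the 'keys_array' and 'ir_builder'
values are assumed to be set up by the surrounding emitter; TF_RETURN_IF_ERROR
comes from tensorflow/core/lib/core/errors.h):

  // Sort the last dimension of 'keys_array' in place, propagating any error
  // to the caller.
  TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
      /*dimension_to_sort=*/keys_array.GetShape().dimensions_size() - 1,
      keys_array, /*name=*/"sort", ir_builder));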