5 files changed, 290 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index c14a5bfb53..462be543bc 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -181,6 +181,20 @@ cc_library(
 )
 
 cc_library(
+    name = "sort_util",
+    srcs = ["sort_util.cc"],
+    hdrs = ["sort_util.h"],
+    deps = [
+        ":ir_array",
+        ":llvm_loop",
+        ":llvm_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
+cc_library(
     name = "tuple_ops",
     srcs = ["tuple_ops.cc"],
     hdrs = ["tuple_ops.h"],
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index c9ae7d3afd..1227534779 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -262,5 +262,35 @@ IrArray::Index ForLoopNest::AddLoopsForShapeOnDimensions(
   return index;
 }
 
+IrArray::Index ForLoopNest::EmitOperandArrayLoopNest(
+    const llvm_ir::IrArray& operand_array, int64 dimension_to_skip,
+    tensorflow::StringPiece name_suffix) {
+  // Prepares the dimension list we will use to emit the loop nest. Outermost
+  // loops are added first. Add loops in major-to-minor order, and skip the
+  // 'dimension_to_skip' dimension.
+  std::vector<int64> dimensions;
+  const Shape& shape = operand_array.GetShape();
+  for (int64 dimension : LayoutUtil::MinorToMajor(shape)) {
+    if (dimension != dimension_to_skip) {
+      dimensions.push_back(dimension);
+    }
+  }
+
+  // Create loop nest with one for-loop for each dimension of the
+  // output.
+  llvm_ir::IrArray::Index index =
+      AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix);
+  // Verify every dimension except the 'dimension_to_skip' dimension was set in
+  // the index.
+  for (size_t dimension = 0; dimension < index.size(); ++dimension) {
+    if (dimension == dimension_to_skip) {
+      DCHECK_EQ(nullptr, index[dimension]);
+    } else {
+      DCHECK_NE(nullptr, index[dimension]);
+    }
+  }
+  return index;
+}
+
 }  // namespace llvm_ir
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index 0dd5b9d3b2..b3266022db 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -248,6 +248,17 @@ class ForLoopNest {
       const Shape& shape, tensorflow::gtl::ArraySlice<int64> dimensions,
       tensorflow::StringPiece suffix);
 
+  // Emits a series of nested loops for iterating over an operand array. Loops
+  // are constructed in major to minor dimension layout order. No loop is
+  // emitted for the given 'dimension_to_skip'. The function returns an IrArray
+  // index for the given operand_array containing the indvars of the loops. All
+  // dimensions of the index are filled except for 'dimension_to_skip'.
+  // name_suffix is the string to append to the names of LLVM constructs (eg,
+  // basic blocks) constructed by this method.
+  IrArray::Index EmitOperandArrayLoopNest(const llvm_ir::IrArray& operand_array,
+                                          int64 dimension_to_skip,
+                                          tensorflow::StringPiece name_suffix);
+
   // Convenience methods which return particular basic blocks of the outermost
   // or innermost loops. These methods return nullptr if no loops have been
   // added yet.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
new file mode 100644
index 0000000000..16a9a5aaeb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -0,0 +1,201 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
+
+// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace llvm_ir {
+
+namespace {
+// Adds the inner comparison loop where we compare elements pointed to by
+// 'keys_index' and 'compare_keys_index'.
+void EmitCompareLoop(int64 dimension_to_sort,
+                     const llvm_ir::IrArray::Index& keys_index,
+                     const llvm_ir::IrArray::Index& compare_keys_index,
+                     const llvm_ir::IrArray& keys_array,
+                     llvm::IRBuilder<>* ir_builder) {
+  // TODO(b/26783907): parallelize this loop.
+
+  // if (is_smaller_index &&
+  //     compare_keys[dimension_to_sort] < dimension_to_sort_bound)
+  llvm::Value* is_smaller_index = ir_builder->CreateICmpSLT(
+      keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]);
+  int64 dimension_to_sort_bound =
+      keys_array.GetShape().dimensions(dimension_to_sort);
+  auto if_data = llvm_ir::EmitIfThenElse(
+      ir_builder->CreateAnd(
+          is_smaller_index,
+          ir_builder->CreateICmpSLT(
+              compare_keys_index[dimension_to_sort],
+              keys_index.GetConstantWithIndexType(dimension_to_sort_bound))),
+      "smaller_comparison_index", ir_builder, /*emit_else=*/false);
+  SetToFirstInsertPoint(if_data.true_block, ir_builder);
+  auto key1 = keys_array.EmitReadArrayElement(keys_index, ir_builder);
+  auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, ir_builder);
+  auto key_type = keys_array.GetShape().element_type();
+  auto comparison =
+      primitive_util::IsFloatingPointType(key_type)
+          // TODO(b/26783907): Figure out how to handle NaNs.
+          ? ir_builder->CreateFCmp(llvm::FCmpInst::FCMP_ULT, key1, key2)
+          : ir_builder->CreateICmp(
+                primitive_util::IsSignedIntegralType(key_type)
+                    ? llvm::ICmpInst::ICMP_SLT
+                    : llvm::ICmpInst::ICMP_ULT,
+                key1, key2);
+  auto min_key = ir_builder->CreateSelect(comparison, key1, key2);
+  auto max_key = ir_builder->CreateSelect(comparison, key2, key1);
+  keys_array.EmitWriteArrayElement(keys_index, min_key, ir_builder);
+  keys_array.EmitWriteArrayElement(compare_keys_index, max_key, ir_builder);
+}
+}  // namespace
+
+Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
+                       tensorflow::StringPiece name,
+                       llvm::IRBuilder<>* ir_builder) {
+  const Shape& keys_shape = keys_array.GetShape();
+
+  // TODO(b/26783907): This case can probably be avoided with the Algebraic
+  // Simplifier.
+  if (ShapeUtil::IsScalar(keys_shape)) {
+    return Status::OK();
+  }
+
+  // Create loop nests which loop through the operand dimensions. The sort
+  // dimension is handled in three separate innermost loops which perform the
+  // sorting.
+  ForLoopNest loop_nest(name, ir_builder);
+  IrArray::Index keys_index =
+      loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys");
+
+  // 'compare_keys_index' is the index of the element that 'keys_index' should
+  // be compared to.
+  IrArray::Index compare_keys_index(keys_index.GetType());
+  for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) {
+    if (dimension != dimension_to_sort) {
+      compare_keys_index.push_back(keys_index[dimension]);
+    } else {
+      compare_keys_index.push_back(nullptr);
+    }
+  }
+
+  // Create the sorting loops which do the sorting.
+  int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
+  std::unique_ptr<ForLoop> stages_loop = loop_nest.AddLoop(
+      /*start_index=*/0,
+      /*end_index=*/
+      tensorflow::Log2Ceiling64(dimension_to_sort_bound),
+      /*suffix=*/"sort_stages");
+  std::unique_ptr<ForLoop> mask_loop = loop_nest.AddLoop(
+      /*suffix=*/"mask",
+      /*start_index=*/keys_index.GetConstantWithIndexType(0),
+      /*end_index=*/stages_loop->GetIndVarValue());
+  std::unique_ptr<ForLoop> compare_loop = loop_nest.AddLoop(
+      /*start_index=*/0,
+      /*end_index=*/dimension_to_sort_bound,
+      /*suffix=*/"compare");
+
+  // Naive C++ code for the inner loops (without parallelization):
+  //
+  // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
+  //     ++stage) {
+  //   int64 first_xor_mask = (1LL << (stage + 1)) - 1;
+  //   for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
+  //     int64 j = i ^ first_xor_mask;
+  //     if (i < j && j < dimension_to_sort_bound) {
+  //       int64 min_key = std::min(keys[i], keys[j]);
+  //       keys[j] = std::max(keys[i], keys[j]);
+  //       keys[i] = min_key;
+  //     }
+  //   }
+  //   for (int64 mask = 0; mask < stage; ++mask) {
+  //     int64 later_xor_mask = (1LL << (stage - (mask + 1));
+  //     for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
+  //       int64 j = i ^ later_xor_mask;
+  //       if (i < j && j < dimension_to_sort_bound) {
+  //         int64 min_key = std::min(keys[i], keys[j]);
+  //         keys[j] = std::max(keys[i], keys[j]);
+  //         keys[i] = min_key;
+  //       }
+  //     }
+  //   }
+  // }
+  //
+  // This follows the algorithm described on Wikipedia:
+  // https://en.wikipedia.org/wiki/Bitonic_sorter
+
+  SetToFirstInsertPoint(stages_loop->GetBodyBasicBlock(), ir_builder);
+  // The first xor mask of a stage is 2^(stage + 1) - 1.
+  auto first_xor_mask = ir_builder->CreateSub(
+      ir_builder->CreateShl(
+          keys_index.GetConstantWithIndexType(1),
+          ir_builder->CreateAdd(stages_loop->GetIndVarValue(),
+                                keys_index.GetConstantWithIndexType(1))),
+      keys_index.GetConstantWithIndexType(1));
+  std::unique_ptr<ForLoop> first_compare_loop = ForLoop::EmitForLoop(
+      /*prefix=*/"first_compare",
+      /*start_index=*/keys_index.GetConstantWithIndexType(0),
+      /*end_index=*/
+      keys_index.GetConstantWithIndexType(dimension_to_sort_bound),
+      /*step=*/keys_index.GetConstantWithIndexType(1),
+      /*ir_builder=*/ir_builder);
+
+  SetToFirstInsertPoint(first_compare_loop->GetBodyBasicBlock(), ir_builder);
+  // 'first_compare_loop' iterates through the 'dimension_to_sort'.
+  keys_index[dimension_to_sort] = first_compare_loop->GetIndVarValue();
+  compare_keys_index[dimension_to_sort] = ir_builder->CreateXor(
+      first_compare_loop->GetIndVarValue(), first_xor_mask);
+  EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array,
+                  ir_builder);
+
+  SetToFirstInsertPoint(compare_loop->GetPreheaderBasicBlock(), ir_builder);
+  // The later masks of a stage are 2^(stage - (mask_loop_ind_var + 1)).
+  auto later_xor_mask = ir_builder->CreateShl(
+      keys_index.GetConstantWithIndexType(1),
+      ir_builder->CreateSub(
+          stages_loop->GetIndVarValue(),
+          ir_builder->CreateAdd(mask_loop->GetIndVarValue(),
+                                keys_index.GetConstantWithIndexType(1))));
+
+  SetToFirstInsertPoint(compare_loop->GetBodyBasicBlock(), ir_builder);
+  // 'compare_loop' iterates through the 'dimension_to_sort'.
+  keys_index[dimension_to_sort] = compare_loop->GetIndVarValue();
+  compare_keys_index[dimension_to_sort] =
+      ir_builder->CreateXor(compare_loop->GetIndVarValue(), later_xor_mask);
+  EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array,
+                  ir_builder);
+
+  // Set the IR builder insert point to the exit basic block of the outer most
+  // loop. This ensures later instructions are inserted after this loop nest.
+  ir_builder->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
+
+  return Status::OK();
+}
+
+}  // namespace llvm_ir
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
new file mode 100644
index 0000000000..fc45bfab12
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace llvm_ir {
+// Emits llvm IR to sort the 'dimension_to_sort' dimension of 'keys_array' into
+// ascending order.
+Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
+                       tensorflow::StringPiece name,
+                       llvm::IRBuilder<>* ir_builder);
+}  // namespace llvm_ir
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_