author Bixia Zheng <bixia@google.com> 2018-03-26 22:44:27 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2018-03-26 22:46:51 -0700
commit 307794e156bc21b2f122bf5e7d907299392023c5 (patch)
tree 2d3fec94e7fb6de4058e601f32ddbf7f8ac373f4
parent b16ec315e7e9d41645634398da202629c3baa5af (diff)
[XLA:CPU] Allow the shape partition algorithm to partition the most minor dimension.

The current shape partition algorithm does not partition the most minor dimension, because doing so produces dynamic loop bounds for the inner loop, which used to prevent LLVM vectorization. That constraint was removed in LLVM revision 328478, so LLVM can now vectorize loops with dynamic bounds. Allowing the most minor dimension to be partitioned is also necessary to support parallelizing matrix-vector multiplication.

Adjust shape_partition_test to reflect this change in the shape partition algorithm.

PiperOrigin-RevId: 190574615
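For illustration only, here is a minimal, self-contained C++ sketch of the greedy idea described above. It is not the XLA implementation: the function name PartitionSketch is hypothetical and the heuristic is deliberately simplified (XLA's assigner balances the per-dimension counts more carefully).

// PartitionSketch: hypothetical, simplified illustration -- not XLA code.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Walk dimensions from outer-most to inner-most, giving each one a partition
// count until the product reaches the target. With the change below, the
// most minor dimension may now be partitioned as well.
std::vector<int64_t> PartitionSketch(const std::vector<int64_t>& dims_outer_to_inner,
                                     int64_t target_partition_count) {
  std::vector<int64_t> partitions;
  int64_t assigned = 1;
  for (int64_t dim_size : dims_outer_to_inner) {
    if (assigned >= target_partition_count) break;
    // Partitions still needed, rounded up.
    const int64_t wanted = (target_partition_count + assigned - 1) / assigned;
    partitions.push_back(std::min(dim_size, wanted));
    assigned *= partitions.back();
  }
  return partitions;
}

int main() {
  // Shape {1, 3} with layout {1, 0}: the outer dimension has size 1, so a
  // target of 2 can only be met by splitting the minor dimension -> {1, 2},
  // matching the updated Shape13WithLayout10 expectation below.
  for (int64_t p : PartitionSketch({1, 3}, 2)) std::cout << p << ' ';
  std::cout << '\n';
  return 0;
}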
-rw-r--r-- tensorflow/compiler/xla/service/cpu/shape_partition.cc 5
-rw-r--r-- tensorflow/compiler/xla/service/cpu/shape_partition_test.cc 116
-rw-r--r-- tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc 4
3 files changed, 43 insertions, 82 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.cc b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
index 61b408b8c2..42fe955f19 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
@@ -20,12 +20,13 @@ namespace cpu {
std::vector<int64> ShapePartitionAssigner::Run(int64 target_partition_count) {
// Gather outer-most dims where dim_size >= 'target_partition_count'.
- // Note: always leave inner-dim static for vectorization/optimizations.
+ // This may include the inner-dim as LLVM can vectorize loops with dynamic
+ // bounds.
std::vector<int64> outer_dims;
int64 outer_dim_size = 1;
// TODO(b/27458679) Consider reserving enough minor dimensions (based on
// target vector register width) to enable vector instructions.
- for (int i = shape_.layout().minor_to_major_size() - 1; i >= 1; --i) {
+ for (int i = shape_.layout().minor_to_major_size() - 1; i >= 0; --i) {
const int64 dimension = shape_.layout().minor_to_major(i);
outer_dims.push_back(dimension);
outer_dim_size *= shape_.dimensions(dimension);
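For context, the entry point touched above can be exercised the same way the updated tests below do. A minimal usage sketch follows; the headers, namespaces, and the int64 alias are assumed to come from the surrounding XLA code.

// Usage sketch only, mirroring the updated tests below; surrounding XLA
// headers/namespaces and the int64 alias are assumed.
Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0});
ShapePartitionAssigner assigner(shape);
// With the relaxed loop bound the minor dimension may now be split: a target
// of 2 yields {1, 2} rather than being capped at {1}.
std::vector<int64> partitions = assigner.Run(/*target_partition_count=*/2);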
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
index ee0c53fa6d..ae80a6f497 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
@@ -30,105 +30,65 @@ class ShapePartitionAssignerTest : public HloTestBase {
protected:
typedef std::vector<int64> Vec;
- void RunR2Test(const Shape& shape, const int64 expected_max_partition_count) {
+ void RunR2Test(const Shape& shape, int64 max_target_partition_count,
+ const std::vector<int64>* expected_partitions) {
ShapePartitionAssigner assigner(shape);
- // Check all partitions of outer dimension.
- for (int64 i = 1; i <= expected_max_partition_count; ++i) {
- EXPECT_TRUE(ContainersEqual(Vec({i}),
- assigner.Run(/*target_partition_count=*/i)));
+ // Iterate through 1..max_target_partition_count.
+ for (int64 i = 1; i <= max_target_partition_count; ++i) {
+ std::vector<int64> actual_partitions =
+ assigner.Run(/*target_partition_count=*/i);
+ EXPECT_THAT(actual_partitions, expected_partitions[i - 1]);
}
- // Check target_partition_count > outer dimension size.
- EXPECT_TRUE(ContainersEqual(
- Vec({expected_max_partition_count}),
- assigner.Run(
- /*target_partition_count=*/expected_max_partition_count + 1)));
}
};
TEST_F(ShapePartitionAssignerTest, Shape13WithLayout10) {
- RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 1);
+ std::vector<int64> expected_partitions[] = {{1} /* 1 */, {1, 2} /* 2 */};
+ RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 2,
+ expected_partitions);
}
TEST_F(ShapePartitionAssignerTest, Shape31WithLayout01) {
- RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 1);
+ std::vector<int64> expected_partitions[] = {
+ {1} /* 1 */, {1, 2} /* 2 */
+ };
+ RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 2,
+ expected_partitions);
}
TEST_F(ShapePartitionAssignerTest, Shape53WithLayout10) {
- RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 5);
+ std::vector<int64> expected_partitions[] = {{1} /* 1 */, {2} /* 2 */,
+ {3} /* 3 */, {4} /* 4 */,
+ {5} /* 5 */, {3, 2} /* 6 */};
+ RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 6,
+ expected_partitions);
}
TEST_F(ShapePartitionAssignerTest, Shape53WithLayout01) {
- RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 3);
+ std::vector<int64> expected_partitions[] = {
+ {1} /* 1 */, {2} /* 2 */, {3} /* 3 */, {2, 2} /* 4 */};
+ RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 4,
+ expected_partitions);
}
TEST_F(ShapePartitionAssignerTest, Shape532WithLayout210) {
- Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0});
- ShapePartitionAssigner assigner(shape);
-
- for (int64 i = 1; i <= 5; ++i) {
- EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
- /*target_partition_count=*/i)));
- }
-
- EXPECT_TRUE(
- ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
- EXPECT_TRUE(
- ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
- EXPECT_TRUE(
- ContainersEqual(Vec({4, 2}), assigner.Run(/*target_partition_count=*/8)));
- EXPECT_TRUE(
- ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
- assigner.Run(/*target_partition_count=*/10)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
- assigner.Run(/*target_partition_count=*/11)));
- EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
- assigner.Run(/*target_partition_count=*/12)));
- EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
- assigner.Run(/*target_partition_count=*/13)));
- EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
- assigner.Run(/*target_partition_count=*/14)));
- EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
- assigner.Run(/*target_partition_count=*/15)));
- EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
- assigner.Run(/*target_partition_count=*/16)));
+ std::vector<int64> expected_partitions[] = {
+ {1} /* 1 */, {2} /* 2 */, {3} /* 3 */, {4} /* 4 */,
+ {5} /* 5 */, {3, 2} /* 6 */, {3, 2} /* 7 */, {4, 2} /* 8 */,
+ {3, 3} /* 9 */, {3, 3} /* 10 */, {3, 3} /* 11 */, {4, 3} /* 12 */,
+ {4, 3} /* 13 */, {4, 3} /* 14 */, {5, 3} /* 15 */, {4, 2, 2} /* 16 */};
+ RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0}), 16,
+ expected_partitions);
}
TEST_F(ShapePartitionAssignerTest, Shape532WithLayout201) {
- Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1});
- ShapePartitionAssigner assigner(shape);
-
- for (int64 i = 1; i <= 3; ++i) {
- EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
- /*target_partition_count=*/i)));
- }
-
- EXPECT_TRUE(
- ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/4)));
- EXPECT_TRUE(
- ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/5)));
- EXPECT_TRUE(
- ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
- EXPECT_TRUE(
- ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
- EXPECT_TRUE(
- ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/8)));
- EXPECT_TRUE(
- ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
- assigner.Run(/*target_partition_count=*/10)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
- assigner.Run(/*target_partition_count=*/11)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
- assigner.Run(/*target_partition_count=*/12)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
- assigner.Run(/*target_partition_count=*/13)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
- assigner.Run(/*target_partition_count=*/14)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
- assigner.Run(/*target_partition_count=*/15)));
- EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
- assigner.Run(/*target_partition_count=*/16)));
+ std::vector<int64> expected_partitions[] = {
+ {1} /* 1 */, {2} /* 2 */, {3} /* 3 */, {2, 2} /* 4 */,
+ {2, 2} /* 5 */, {3, 2} /* 6 */, {3, 2} /* 7 */, {3, 2} /* 8 */,
+ {3, 3} /* 9 */, {3, 3} /* 10 */, {3, 3} /* 11 */, {3, 4} /* 12 */,
+ {3, 4} /* 13 */, {3, 4} /* 14 */, {3, 5} /* 15 */, {3, 2, 2} /* 16 */};
+ RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1}), 16,
+ expected_partitions);
}
class ShapePartitionIteratorTest : public HloTestBase {
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index 6fe7737de7..b28fe0c15a 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -71,8 +71,8 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) {
#ifdef XLA_TEST_BACKEND_CPU
// TODO(b/73141998): The vectorized Log implementation gives results outside
// our error spec in this range (these numbers are bitwise representations of
- // floats expressed as a zero extended int64):
- std::pair<int64, int64> known_incorrect_range = {1, 8315654};
+ // floats expressed as a zero extended int64).
+ std::pair<int64, int64> known_incorrect_range = {1, 8388608};
#else
std::pair<int64, int64> known_incorrect_range = {0, 0};
#endif
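For reference, the new upper bound 8388608 (0x00800000) is, read as a zero-extended bit pattern, the smallest positive normal float, so the widened range covers every positive denormal plus that first normal value. Below is a small standalone check, assuming IEEE-754 binary32 floats; it is not part of the test.

// Standalone illustration, assuming IEEE-754 binary32 floats; not test code.
#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
  const uint32_t bits = 8388608;  // 0x00800000, the new upper bound
  float value;
  static_assert(sizeof(value) == sizeof(bits), "expects 32-bit float");
  std::memcpy(&value, &bits, sizeof(value));  // reinterpret the bit pattern
  std::cout << value << '\n';                 // prints 1.17549e-38 (FLT_MIN)
  return 0;
}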