7 files changed, 32 insertions, 31 deletions
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 6e050cf564..6641d45e83 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -56,9 +56,9 @@ namespace bar {
 //
 // Memory stats:
 //   arg bytes total:    104
-//   arg bytes aligned:  128
+//   arg bytes aligned:  192
 //   temp bytes total:   126
-//   temp bytes aligned: 224
+//   temp bytes aligned: 320
 class MyClass : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h
index d085864f00..d1a669ceb1 100644
--- a/tensorflow/compiler/aot/runtime.h
+++ b/tensorflow/compiler/aot/runtime.h
@@ -25,8 +25,8 @@ namespace tensorflow {
 namespace tfcompile {
 namespace runtime {
 
-// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
-static constexpr size_t kAlign = 32;
+// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
+static constexpr size_t kAlign = 64;
 
 // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1
 // values.  There are `n` entries in `sizes`.  Each buffer is aligned to kAlign
diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc
index 6d603a02eb..06ec623eb2 100644
--- a/tensorflow/compiler/aot/runtime_test.cc
+++ b/tensorflow/compiler/aot/runtime_test.cc
@@ -24,7 +24,7 @@ namespace runtime {
 namespace {
 
 TEST(Runtime, AlignmentValue) {
-  // We've chosen 32 byte alignment for the tfcompile runtime to mimic the
+  // We've chosen 64 byte alignment for the tfcompile runtime to mimic the
   // regular tensorflow allocator, which was chosen to play nicely with Eigen.
   // The tfcompile runtime also has a requirement that comes from the xla
   // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8
@@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) {
   EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0);
 
   static constexpr intptr_t sizesB[1] = {3};
-  EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32);
+  EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64);
 
   static constexpr intptr_t sizesC[1] = {32};
-  EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32);
+  EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64);
 
   static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3};
-  EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192);
+  EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320);
 }
 
 void* add_ptr(void* base, uintptr_t delta) {
@@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) {
   EXPECT_NE(base, nullptr);
   EXPECT_EQ(bufD[0], add_ptr(base, 0));
   EXPECT_EQ(bufD[1], nullptr);
-  EXPECT_EQ(bufD[2], add_ptr(base, 32));
+  EXPECT_EQ(bufD[2], add_ptr(base, 64));
   EXPECT_EQ(bufD[3], nullptr);
-  EXPECT_EQ(bufD[4], add_ptr(base, 64));
-  EXPECT_EQ(bufD[5], add_ptr(base, 128));
-  EXPECT_EQ(bufD[6], add_ptr(base, 160));
+  EXPECT_EQ(bufD[4], add_ptr(base, 128));
+  EXPECT_EQ(bufD[5], add_ptr(base, 192));
+  EXPECT_EQ(bufD[6], add_ptr(base, 256));
   for (int i = 0; i < 7; ++i) {
     const intptr_t size = sizesD[i];
     if (size != -1) {
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2c87156dca..2bb4d32d57 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -67,13 +67,8 @@ struct AllocatorStats {
 // device memory.
 class Allocator {
  public:
-#ifdef EIGEN_VECTORIZE_AVX512
   // Align to 64 byte boundary.
   static constexpr size_t kAllocatorAlignment = 64;
-#else
-  // Align to 32 byte boundary.
-  static constexpr size_t kAllocatorAlignment = 32;
-#endif
 
   virtual ~Allocator();
 
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index b613effd18..80e168df97 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) {
 
 // On the alignment.
 //
-// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte
+// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte
 // alignment. Tensor::tensor/flat/vec/matrix methods requires the
 // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually,
-// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure
-// its result is aligned if the caller intends to use those methods.
-// In this test case, we simply make sure each slice is 32-byte
-// aligned: sizeof(float) * 4 * 2 = 32.
+// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires
+// the caller to ensure its result is aligned if the caller intends
+// to use those methods. In this test case, we simply make sure each
+// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576.  576 % 64 = 0.
 TEST(Tensor, Slice_Basic) {
   Tensor saved;
   {  // General
-    Tensor x(DT_FLOAT, TensorShape({10, 4, 34}));
+    Tensor x(DT_FLOAT, TensorShape({10, 4, 36}));
     // Fills in known values.
     for (int i = 0; i < 10; ++i) {
       x.Slice(i, i + 1).flat<float>().setConstant(i * 1.f);
     }
     // A simple slice along dim0.
     Tensor y = x.Slice(4, 8);
-    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34})));
+    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36})));
     auto tx = x.tensor<float, 3>();
     auto ty = y.tensor<float, 3>();
     for (int i = 0; i < 4; ++i) {
       for (int j = 0; j < 4; ++j) {
-        for (int k = 0; k < 34; ++k) {
+        for (int k = 0; k < 36; ++k) {
           EXPECT_EQ(ty(i, j, k), 4.0 + i);
           EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k));
         }
@@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) {
     auto tz = z.tensor<float, 3>();
     EXPECT_EQ(1, z.dim_size(0));
     for (int j = 0; j < 4; ++j) {
-      for (int k = 0; k < 34; ++k) {
+      for (int k = 0; k < 36; ++k) {
         EXPECT_EQ(tz(0, j, k), 6.0);
       }
     }
@@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) {
     EXPECT_EQ(1, saved.dim_size(0));
     auto tsaved = saved.tensor<float, 3>();
     for (int j = 0; j < 4; ++j) {
-      for (int k = 0; k < 34; ++k) {
+      for (int k = 0; k < 36; ++k) {
         EXPECT_EQ(tsaved(0, j, k), 6.0);
       }
     }
   }
   {  // Empty
-    Tensor x(DT_FLOAT, TensorShape({10, 0, 34}));
+    Tensor x(DT_FLOAT, TensorShape({10, 0, 36}));
     x.flat<float>().setRandom();
     Tensor y = x.Slice(4, 8);
-    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34})));
+    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36})));
   }
 
   {
diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
index 019c6619ee..d2918d2042 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
@@ -212,8 +212,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) {
 }
 
 TEST_F(ScopedAllocatorConcatOpTest, Reshape) {
-  MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2);
-  ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
+  MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2);
+
+  // The elements of the third parameter to ExecOp must be multiples of
+  // Allocator::kAllocatorAlignment in size.  If they are not, the backing
+  // tensor allocated by PrepOp will have too many elements and reshaping
+  // will fail.
+  ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}});
 }
 
 TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) {
diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD
index 07bb6645eb..e54c1a4501 100644
--- a/third_party/eigen.BUILD
+++ b/third_party/eigen.BUILD
@@ -64,6 +64,7 @@ cc_library(
         # This define (mostly) guarantees we don't link any problematic
         # code. We use it, but we do not rely on it, as evidenced above.
         "EIGEN_MPL2_ONLY",
+        "EIGEN_MAX_ALIGN_BYTES=64",
     ],
     includes = ["."],
     visibility = ["//visibility:public"],