diff options
-rw-r--r-- | tensorflow/compiler/aot/codegen_test_h.golden | 4 | ||||
-rw-r--r-- | tensorflow/compiler/aot/runtime.h | 4 | ||||
-rw-r--r-- | tensorflow/compiler/aot/runtime_test.cc | 16 | ||||
-rw-r--r-- | tensorflow/core/framework/allocator.h | 5 | ||||
-rw-r--r-- | tensorflow/core/framework/tensor_test.cc | 24 | ||||
-rw-r--r-- | tensorflow/core/kernels/scoped_allocator_ops_test.cc | 9 | ||||
-rw-r--r-- | third_party/eigen.BUILD | 1 |
7 files changed, 32 insertions, 31 deletions
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6e050cf564..6641d45e83 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -56,9 +56,9 @@ namespace bar { // // Memory stats: // arg bytes total: 104 -// arg bytes aligned: 128 +// arg bytes aligned: 192 // temp bytes total: 126 -// temp bytes aligned: 224 +// temp bytes aligned: 320 class MyClass : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h index d085864f00..d1a669ceb1 100644 --- a/tensorflow/compiler/aot/runtime.h +++ b/tensorflow/compiler/aot/runtime.h @@ -25,8 +25,8 @@ namespace tensorflow { namespace tfcompile { namespace runtime { -// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 32; +// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +static constexpr size_t kAlign = 64; // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 // values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc index 6d603a02eb..06ec623eb2 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/aot/runtime_test.cc @@ -24,7 +24,7 @@ namespace runtime { namespace { TEST(Runtime, AlignmentValue) { - // We've chosen 32 byte alignment for the tfcompile runtime to mimic the + // We've chosen 64 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8 @@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) { EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192); + EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); } void* add_ptr(void* base, uintptr_t delta) { @@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) { EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 32)); + EXPECT_EQ(bufD[2], add_ptr(base, 64)); EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 64)); - EXPECT_EQ(bufD[5], add_ptr(base, 128)); - EXPECT_EQ(bufD[6], add_ptr(base, 160)); + EXPECT_EQ(bufD[4], add_ptr(base, 128)); + EXPECT_EQ(bufD[5], add_ptr(base, 192)); + EXPECT_EQ(bufD[6], add_ptr(base, 256)); for (int i = 0; i < 7; ++i) { const intptr_t size = sizesD[i]; if (size != -1) { diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 2c87156dca..2bb4d32d57 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -67,13 +67,8 @@ struct AllocatorStats { // device memory. class Allocator { public: -#ifdef EIGEN_VECTORIZE_AVX512 // Align to 64 byte boundary. static constexpr size_t kAllocatorAlignment = 64; -#else - // Align to 32 byte boundary. - static constexpr size_t kAllocatorAlignment = 32; -#endif virtual ~Allocator(); diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index b613effd18..80e168df97 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) { // On the alignment. // -// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte +// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte // alignment. Tensor::tensor/flat/vec/matrix methods requires the // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually, -// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure -// its result is aligned if the caller intends to use those methods. -// In this test case, we simply make sure each slice is 32-byte -// aligned: sizeof(float) * 4 * 2 = 32. +// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires +// the caller to ensure its result is aligned if the caller intends +// to use those methods. In this test case, we simply make sure each +// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0. TEST(Tensor, Slice_Basic) { Tensor saved; { // General - Tensor x(DT_FLOAT, TensorShape({10, 4, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 4, 36})); // Fills in known values. for (int i = 0; i < 10; ++i) { x.Slice(i, i + 1).flat<float>().setConstant(i * 1.f); } // A simple slice along dim0. Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36}))); auto tx = x.tensor<float, 3>(); auto ty = y.tensor<float, 3>(); for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(ty(i, j, k), 4.0 + i); EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k)); } @@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) { auto tz = z.tensor<float, 3>(); EXPECT_EQ(1, z.dim_size(0)); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tz(0, j, k), 6.0); } } @@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) { EXPECT_EQ(1, saved.dim_size(0)); auto tsaved = saved.tensor<float, 3>(); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tsaved(0, j, k), 6.0); } } } { // Empty - Tensor x(DT_FLOAT, TensorShape({10, 0, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 0, 36})); x.flat<float>().setRandom(); Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36}))); } { diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc index 019c6619ee..d2918d2042 100644 --- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc +++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc @@ -212,8 +212,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) { } TEST_F(ScopedAllocatorConcatOpTest, Reshape) { - MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2); - ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}}); + MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2); + + // The elements of the third parameter to ExecOp must be multiples of + // Allocator::kAllocatorAlignment in size. If they are not, the backing + // tensor allocated by PrepOp will have too many elements and reshaping + // will fail. + ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}}); } TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) { diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD index 07bb6645eb..e54c1a4501 100644 --- a/third_party/eigen.BUILD +++ b/third_party/eigen.BUILD @@ -64,6 +64,7 @@ cc_library( # This define (mostly) guarantees we don't link any problematic # code. We use it, but we do not rely on it, as evidenced above. "EIGEN_MPL2_ONLY", + "EIGEN_MAX_ALIGN_BYTES=64", ], includes = ["."], visibility = ["//visibility:public"], |