From bd2e5a788ac074535b4f973ac81ac61d4a166288 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 17:10:40 -0800 Subject: Made sure the number of floating point operations done by a benchmark is computed using 64 bit integers to avoid overflows. --- bench/tensors/tensor_benchmarks.h | 40 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'bench') diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index ba7e7eb48..365504009 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -13,8 +13,6 @@ typedef int TensorIndex; using Eigen::Tensor; using Eigen::TensorMap; -typedef int64_t int64; - // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. template class BenchmarkSuite { @@ -42,7 +40,7 @@ template class BenchmarkSuite { device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); } // Record the number of values copied per second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void typeCasting(int num_iters) { @@ -56,7 +54,7 @@ template class BenchmarkSuite { B.device(device_) = A.cast(); } // Record the number of values copied per second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void random(int num_iters) { @@ -69,7 +67,7 @@ template class BenchmarkSuite { C.device(device_) = C.random(); } // Record the number of random numbers generated per second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void slicing(int num_iters) { @@ -98,7 +96,7 @@ template class BenchmarkSuite { } // Record the number of values copied from the rhs slice to the lhs slice // each second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void rowChip(int num_iters) { @@ -112,7 +110,7 @@ template class BenchmarkSuite 
{ C.device(device_) = B.chip(iter % k_, 0); } // Record the number of values copied from the rhs chip to the lhs. - finalizeBenchmark(n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(n_) * num_iters); } void colChip(int num_iters) { @@ -126,7 +124,7 @@ template class BenchmarkSuite { C.device(device_) = B.chip(iter % n_, 1); } // Record the number of values copied from the rhs chip to the lhs. - finalizeBenchmark(n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(n_) * num_iters); } void shuffling(int num_iters) { @@ -143,7 +141,7 @@ template class BenchmarkSuite { B.device(device_) = A.shuffle(shuffle); } // Record the number of values shuffled from A and copied to B each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void padding(int num_iters) { @@ -162,7 +160,7 @@ template class BenchmarkSuite { B.device(device_) = A.pad(paddings); } // Record the number of values copied from the padded tensor A each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void striding(int num_iters) { @@ -179,7 +177,7 @@ template class BenchmarkSuite { B.device(device_) = A.stride(strides); } // Record the number of values copied from the padded tensor A each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void broadcasting(int num_iters) { @@ -202,7 +200,7 @@ template class BenchmarkSuite { C.device(device_) = A.broadcast(broadcast); } // Record the number of values broadcasted from A and copied to C each second - finalizeBenchmark(m_ * n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters); } void coeffWiseOp(int num_iters) { @@ -218,7 +216,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (2 multiplications and // 1 addition per value) - finalizeBenchmark(3 * m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters); } void 
algebraicFunc(int num_iters) { @@ -234,7 +232,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void transcendentalFunc(int num_iters) { @@ -250,7 +248,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } // Row reduction @@ -274,7 +272,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(k_ * n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters); } // Column reduction @@ -300,7 +298,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(k_ * n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters); } // do a contraction which is equivalent to a matrix multiplication @@ -322,7 +320,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (size_ multiplications and // additions for each value in the resulting tensor) - finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters); } void convolution(int num_iters, int kernel_x, int kernel_y) { @@ -341,8 +339,8 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (kernel_size // multiplications and additions for each value in the resulting tensor) - finalizeBenchmark( - (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters); + finalizeBenchmark(static_cast<int64_t>(2) * + (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters); } private: @@ -360,7 +358,7 @@ template class BenchmarkSuite { 
//BenchmarkUseRealTime(); } - inline void finalizeBenchmark(int64 num_items) { + inline void finalizeBenchmark(int64_t num_items) { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) if (Eigen::internal::is_same::value) { device_.synchronize(); -- cgit v1.2.3