From 007edee1acb571223f36e62c9ee2e5a95c1a6bab Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 21 Jul 2016 12:32:02 +0200
Subject: Add a doc page summarizing the true speed of Eigen's decompositions.

---
 doc/DenseDecompositionBenchmark.dox | 42 +++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 doc/DenseDecompositionBenchmark.dox

(limited to 'doc/DenseDecompositionBenchmark.dox')
diff --git a/doc/DenseDecompositionBenchmark.dox b/doc/DenseDecompositionBenchmark.dox
new file mode 100644
index 000000000..7be9c70cd
--- /dev/null
+++ b/doc/DenseDecompositionBenchmark.dox
@@ -0,0 +1,42 @@
+namespace Eigen {
+
+/** \eigenManualPage DenseDecompositionBenchmark Benchmark of dense decompositions
+
+This page presents a speed comparison of the dense matrix decompositions offered by %Eigen for a wide range of square matrices and overconstrained problems.
+
+For a more general overview on the features and numerical robustness of linear solvers and decompositions, check this \link TopicLinearAlgebraDecompositions table \endlink.
+
+This benchmark has been run on a laptop equipped with an Intel core i7 \@ 2,6 GHz, and compiled with clang with \b AVX and \b FMA instruction sets enabled but without multi-threading.
+It uses \b single \b precision \b float numbers. For double, you can get a good estimate by multiplying the timings by a factor 2.
+
+The square matrices are symmetric, and for the overconstrained matrices, the reported timmings include the cost to compute the symmetric covariance matrix \f$ A^T A \f$ for the first four solvers based on Cholesky and LU, as denoted by the \b * symbol (top-right corner part of the table).
+Timings are in \b milliseconds, and factors are relative to the LLT decomposition which is the fastest but also the least general and robust.
+
+<table class="manual">
+<tr><th>solver/size</th>
+  <th>8x8</th>  <th>100x100</th>  <th>1000x1000</th>  <th>4000x4000</th>  <th>10000x8</th>  <th>10000x100</th>  <th>10000x1000</th>  <th>10000x4000</th></tr>
+<tr><td>LLT</td><td>0.05</td><td>0.42</td><td>5.83</td><td>374.55</td><td>6.79 <sup><a href="#note_ls">*</a></sup></td><td>30.15 <sup><a href="#note_ls">*</a></sup></td><td>236.34 <sup><a href="#note_ls">*</a></sup></td><td>3847.17 <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>LDLT</td><td>0.07 (x1.3)</td><td>0.65 (x1.5)</td><td>26.86 (x4.6)</td><td>2361.18 (x6.3)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.91 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>252.61 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>5807.66 (x1.5) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr><td>PartialPivLU</td><td>0.08 (x1.5)</td><td>0.69 (x1.6)</td><td>15.63 (x2.7)</td><td>709.32 (x1.9)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.32 (x1) <sup><a href="#note_ls">*</a></sup></td><td>241.68 (x1) <sup><a href="#note_ls">*</a></sup></td><td>4270.48 (x1.1) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>FullPivLU</td><td>0.1 (x1.9)</td><td>4.48 (x10.6)</td><td>281.33 (x48.2)</td><td>-</td><td>6.83 (x1) <sup><a href="#note_ls">*</a></sup></td><td>32.67 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>498.25 (x2.1) <sup><a href="#note_ls">*</a></sup></td><td>-</td></tr>
+<tr><td>HouseholderQR</td><td>0.19 (x3.5)</td><td>2.18 (x5.2)</td><td>23.42 (x4)</td><td>1337.52 (x3.6)</td><td>34.26 (x5)</td><td>129.01 (x4.3)</td><td>377.37 (x1.6)</td><td>4839.1 (x1.3)</td></tr>
+<tr class="alt"><td>ColPivHouseholderQR</td><td>0.23 (x4.3)</td><td>2.23 (x5.3)</td><td>103.34 (x17.7)</td><td>9987.16 (x26.7)</td><td>36.05 (x5.3)</td><td>163.18 (x5.4)</td><td>2354.08 (x10)</td><td>37860.5 (x9.8)</td></tr>
+<tr><td>CompleteOrthogonalDecomposition</td><td>0.23 (x4.3)</td><td>2.22 (x5.2)</td><td>99.44 (x17.1)</td><td>10555.3 (x28.2)</td><td>35.75 (x5.3)</td><td>169.39 (x5.6)</td><td>2150.56 (x9.1)</td><td>36981.8 (x9.6)</td></tr>
+<tr class="alt"><td>FullPivHouseholderQR</td><td>0.23 (x4.3)</td><td>4.64 (x11)</td><td>289.1 (x49.6)</td><td>-</td><td>69.38 (x10.2)</td><td>446.73 (x14.8)</td><td>4852.12 (x20.5)</td><td>-</td></tr>
+<tr><td>JacobiSVD</td><td>1.01 (x18.6)</td><td>71.43 (x168.4)</td><td>-</td><td>-</td><td>113.81 (x16.7)</td><td>1179.66 (x39.1)</td><td>-</td><td>-</td></tr>
+<tr class="alt"><td>BDCSVD</td><td>1.07 (x19.7)</td><td>21.83 (x51.5)</td><td>331.77 (x56.9)</td><td>18587.9 (x49.6)</td><td>110.53 (x16.3)</td><td>397.67 (x13.2)</td><td>2975 (x12.6)</td><td>48593.2 (x12.6)</td></tr>
+</table>
+
+<a name="note_ls">\b *: </a> This decomposition do not support direct least-square solving for over-constrained problems, and the reported timing include the cost to form the symmetric covariance matrix \f$ A^T A \f$.
+
+\b Observations:
+ + LLT is always the fastest solvers.
+ + For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
+ + For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
+ + CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
+
+The above table has been generated by the <a href="https://bitbucket.org/eigen/eigen/raw/default/bench/dense_solvers.cpp">bench/dense_solvers.cpp</a> file, feel-free to hack it to generate a table matching your hardware, compiler, and favorite problem sizes.
+
+*/
+
+}
-- 
cgit v1.2.3


solver/size	8x8	100x100	1000x1000	4000x4000	10000x8	10000x100	10000x1000	10000x4000
LLT	0.05	0.42	5.83	374.55	6.79 ^*	30.15 ^*	236.34 ^*	3847.17 ^*
LDLT	0.07 (x1.3)	0.65 (x1.5)	26.86 (x4.6)	2361.18 (x6.3)	6.81 (x1) ^*	31.91 (x1.1) ^*	252.61 (x1.1) ^*	5807.66 (x1.5) ^*
PartialPivLU	0.08 (x1.5)	0.69 (x1.6)	15.63 (x2.7)	709.32 (x1.9)	6.81 (x1) ^*	31.32 (x1) ^*	241.68 (x1) ^*	4270.48 (x1.1) ^*
FullPivLU	0.1 (x1.9)	4.48 (x10.6)	281.33 (x48.2)	-	6.83 (x1) ^*	32.67 (x1.1) ^*	498.25 (x2.1) ^*	-
HouseholderQR	0.19 (x3.5)	2.18 (x5.2)	23.42 (x4)	1337.52 (x3.6)	34.26 (x5)	129.01 (x4.3)	377.37 (x1.6)	4839.1 (x1.3)
ColPivHouseholderQR	0.23 (x4.3)	2.23 (x5.3)	103.34 (x17.7)	9987.16 (x26.7)	36.05 (x5.3)	163.18 (x5.4)	2354.08 (x10)	37860.5 (x9.8)
CompleteOrthogonalDecomposition	0.23 (x4.3)	2.22 (x5.2)	99.44 (x17.1)	10555.3 (x28.2)	35.75 (x5.3)	169.39 (x5.6)	2150.56 (x9.1)	36981.8 (x9.6)
FullPivHouseholderQR	0.23 (x4.3)	4.64 (x11)	289.1 (x49.6)	-	69.38 (x10.2)	446.73 (x14.8)	4852.12 (x20.5)	-
JacobiSVD	1.01 (x18.6)	71.43 (x168.4)	-	-	113.81 (x16.7)	1179.66 (x39.1)	-	-
BDCSVD	1.07 (x19.7)	21.83 (x51.5)	331.77 (x56.9)	18587.9 (x49.6)	110.53 (x16.3)	397.67 (x13.2)	2975 (x12.6)	48593.2 (x12.6)