From 32b08ac971b2ab66cf83360a9e2d42a99bfe3b70 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 17 Jul 2009 16:22:39 +0200 Subject: re-implement stableNorm using a homemade blocky and vectorization friendly algorithm (slow if no vectorization) --- bench/bench_norm.cpp | 106 +++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 50 deletions(-) (limited to 'bench/bench_norm.cpp') diff --git a/bench/bench_norm.cpp b/bench/bench_norm.cpp index 8e4fefdd7..7a3dc2e68 100644 --- a/bench/bench_norm.cpp +++ b/bench/bench_norm.cpp @@ -13,7 +13,7 @@ EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v) template EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v) { - return v.stableNorm(); + return v.hypotNorm(); } template @@ -56,23 +56,7 @@ EIGEN_DONT_INLINE typename T::Scalar twopassNorm(T& v) template EIGEN_DONT_INLINE typename T::Scalar bl2passNorm(T& v) { - const int blockSize = 4096; - typedef typename T::Scalar Scalar; - Scalar s = 0; - Scalar ssq = 0; - for (int bi=0; bi::type,Eigen::Dynamic,1,Eigen::ForceAligned> sv(v,bi,0,r,1); - Scalar m = sv.cwise().abs().maxCoeff(); - if (m>s) - { - ssq = ssq * ei_abs2(s/m); - s = m; - } - ssq += (sv/s).squaredNorm(); - } - return s*ei_sqrt(ssq); + return v.stableNorm(); } template @@ -163,6 +147,19 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) Packet ax_s1m = ei_pmul(ax,ps1m); Packet maskBig = ei_plt(pb2,ax); Packet maskSml = ei_plt(ax,pb1); + +// Packet maskMed = ei_pand(maskSml,maskBig); +// Packet scale = ei_pset1(Scalar(0)); +// scale = ei_por(scale, ei_pand(maskBig,ps2m)); +// scale = ei_por(scale, ei_pand(maskSml,ps1m)); +// scale = ei_por(scale, ei_pandnot(ei_pset1(Scalar(1)),maskMed)); +// ax = ei_pmul(ax,scale); +// ax = ei_pmul(ax,ax); +// pabig = ei_padd(pabig, ei_pand(maskBig, ax)); +// pasml = ei_padd(pasml, ei_pand(maskSml, ax)); +// pamed = ei_padd(pamed, ei_pandnot(ax,maskMed)); + + pabig = ei_padd(pabig, ei_pand(maskBig, ei_pmul(ax_s2m,ax_s2m))); pasml = ei_padd(pasml, ei_pand(maskSml, ei_pmul(ax_s1m,ax_s1m))); pamed = ei_padd(pamed, ei_pandnot(ei_pmul(ax,ax),ei_pand(maskSml,maskBig))); @@ -215,7 +212,7 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) } #define BENCH_PERF(NRM) { \ - Eigen::BenchTimer tf, td; tf.reset(); td.reset();\ + Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\ for (int k=0; k()) << "\t" << blueNorm(vd.cast()) << "\n"; std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\t" << lapackNorm(vf.cast()) << "\t" << lapackNorm(vd.cast()) << "\n"; std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\t" << twopassNorm(vf.cast()) << "\t" << twopassNorm(vd.cast()) << "\n"; - std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\t" << bl2passNorm(vf.cast()) << "\t" << bl2passNorm(vd.cast()) << "\n"; +// std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\t" << bl2passNorm(vf.cast()) << "\t" << bl2passNorm(vd.cast()) << "\n"; } int main(int argc, char** argv) @@ -273,34 +275,7 @@ int main(int argc, char** argv) double y = 1.1345743233455785456788e12 * ei_random(); VectorXf v = VectorXf::Ones(1024) * y; - std::cerr << "Performance (out of cache):\n"; - { - int iters = 1; - VectorXf vf = VectorXf::Random(1024*1024*32) * y; - VectorXd vd = VectorXd::Random(1024*1024*32) * y; - BENCH_PERF(sqsumNorm); - BENCH_PERF(blueNorm); - BENCH_PERF(pblueNorm); - BENCH_PERF(lapackNorm); - BENCH_PERF(hypotNorm); - BENCH_PERF(twopassNorm); - BENCH_PERF(bl2passNorm); - } - - std::cerr << "\nPerformance (in cache):\n"; - { - int iters = 100000; - VectorXf vf = VectorXf::Random(512) * y; - VectorXd vd = VectorXd::Random(512) * y; - BENCH_PERF(sqsumNorm); - BENCH_PERF(blueNorm); - BENCH_PERF(pblueNorm); - BENCH_PERF(lapackNorm); - BENCH_PERF(hypotNorm); - BENCH_PERF(twopassNorm); - BENCH_PERF(bl2passNorm); - } -return 0; +// return 0; int s = 10000; double basef_ok = 1.1345743233455785456788e15; double based_ok = 1.1345743233455785456788e95; @@ -317,7 +292,7 @@ return 0; check_accuracy(basef_ok, based_ok, s); std::cerr << "\nUnderflow:\n"; - check_accuracy(basef_under, based_under, 1); + check_accuracy(basef_under, based_under, s); std::cerr << "\nOverflow:\n"; check_accuracy(basef_over, based_over, s); @@ -335,4 +310,35 @@ return 0; check_accuracy_var(-27,20,-302,-190,s); std::cout << "\n"; } + + std::cout.precision(4); + std::cerr << "Performance (out of cache):\n"; + { + int iters = 1; + VectorXf vf = VectorXf::Random(1024*1024*32) * y; + VectorXd vd = VectorXd::Random(1024*1024*32) * y; + VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y; + BENCH_PERF(sqsumNorm); + BENCH_PERF(blueNorm); +// BENCH_PERF(pblueNorm); +// BENCH_PERF(lapackNorm); +// BENCH_PERF(hypotNorm); +// BENCH_PERF(twopassNorm); + BENCH_PERF(bl2passNorm); + } + + std::cerr << "\nPerformance (in cache):\n"; + { + int iters = 100000; + VectorXf vf = VectorXf::Random(512) * y; + VectorXd vd = VectorXd::Random(512) * y; + VectorXcf vcf = VectorXcf::Random(512) * y; + BENCH_PERF(sqsumNorm); + BENCH_PERF(blueNorm); +// BENCH_PERF(pblueNorm); +// BENCH_PERF(lapackNorm); +// BENCH_PERF(hypotNorm); +// BENCH_PERF(twopassNorm); + BENCH_PERF(bl2passNorm); + } } -- cgit v1.2.3