From ccc7b0ffead13a179cae85bb5fff5e229931c37d Mon Sep 17 00:00:00 2001 From: nnyby Date: Thu, 1 Oct 2015 23:43:06 +0000 Subject: [doc] grammar fix: "linearly space" -> "linearly spaced" --- Eigen/src/Core/CwiseNullaryOp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 2bc6933d9..3361d0d76 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -224,7 +224,7 @@ DenseBase<Derived>::Constant(const Scalar& value) } /** - * \brief Sets a linearly space vector. + * \brief Sets a linearly spaced vector. * * The function generates 'size' equally spaced values in the closed interval [low,high]. * This particular version of LinSpaced() uses sequential access, i.e. vector access is @@ -262,7 +262,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) } /** - * \brief Sets a linearly space vector. + * \brief Sets a linearly spaced vector. * * The function generates 'size' equally spaced values in the closed interval [low,high]. * When size is set to 1, a vector of length 1 containing 'high' is returned. @@ -375,7 +375,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val) } /** - * \brief Sets a linearly space vector. + * \brief Sets a linearly spaced vector. * * The function generates 'size' equally spaced values in the closed interval [low,high]. * When size is set to 1, a vector of length 1 containing 'high' is returned. @@ -395,7 +395,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) } /** - * \brief Sets a linearly space vector. + * \brief Sets a linearly spaced vector. * * The function fill *this with equally spaced values in the closed interval [low,high]. * When size is set to 1, a vector of length 1 containing 'high' is returned. -- cgit v1.2.3 From fa4f933c0fe65eda6a051f978db12210f11f5cdb Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 7 Dec 2015 15:24:49 -0800 Subject: Add special functions to Eigen: lgamma, erf, erfc. Includes CUDA support and unit tests.
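For illustration (not part of the patch): a minimal sketch of the coefficient-wise API this commit introduces, using only names visible in the diffs below — the lgamma/erf/erfc members added by the ArrayCwiseUnaryOps plugin, the free functions added in GlobalFunctions.h, and the scalar entry points in numext. It assumes a checkout that already contains this commit; in later Eigen releases these functions moved to the unsupported SpecialFunctions module.

#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::ArrayXd x(3);
  x << 0.5, 1.0, 2.0;

  // Member syntax from the ArrayCwiseUnaryOps plugin:
  std::cout << x.lgamma() << "\n";  // ln|Gamma(x_i)|, coefficient-wise
  std::cout << x.erf()    << "\n";  // Gauss error function
  std::cout << x.erfc()   << "\n";  // complementary error function

  // Free-function syntax from GlobalFunctions.h:
  std::cout << Eigen::erf(x) << "\n";

  // Scalar entry point from the numext namespace:
  std::cout << Eigen::numext::lgamma(3.5) << "\n";
  return 0;
}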
--- Eigen/Core | 1 + Eigen/src/Core/GenericPacketMath.h | 15 +++ Eigen/src/Core/GlobalFunctions.h | 3 + Eigen/src/Core/SpecialFunctions.h | 144 ++++++++++++++++++++++++ Eigen/src/Core/arch/CUDA/MathFunctions.h | 37 ++++++ Eigen/src/Core/arch/CUDA/PacketMath.h | 6 + Eigen/src/Core/functors/UnaryFunctors.h | 72 ++++++++++++ Eigen/src/Core/util/ForwardDeclarations.h | 6 + Eigen/src/Core/util/StaticAssert.h | 3 +- Eigen/src/plugins/ArrayCwiseUnaryOps.h | 44 ++++++++ test/array.cpp | 3 + test/packetmath.cpp | 23 ++++ unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 18 +++ unsupported/test/cxx11_tensor_cuda.cpp | 139 +++++++++++++++++++++++ 14 files changed, 513 insertions(+), 1 deletion(-) create mode 100644 Eigen/src/Core/SpecialFunctions.h diff --git a/Eigen/Core b/Eigen/Core index 1ec749452..63602f4c3 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -300,6 +300,7 @@ using std::ptrdiff_t; #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" +#include "src/Core/SpecialFunctions.h" #include "src/Core/GenericPacketMath.h" #if defined EIGEN_VECTORIZE_AVX diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 5f27d8166..0e7dd29ed 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -74,6 +74,9 @@ struct default_packet_traits HasSinh = 0, HasCosh = 0, HasTanh = 0, + HasLGamma = 0, + HasErf = 0, + HasErfc = 0 HasRound = 0, HasFloor = 0, @@ -432,6 +435,18 @@ Packet pfloor(const Packet& a) { using numext::floor; return floor(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } +/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet plgamma(const Packet& a) { return numext::lgamma(a); } + +/** \internal \returns the erf(\a a) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet perf(const Packet& a) { return numext::erf(a); } + +/** \internal \returns the erfc(\a a) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet perfc(const Packet& a) { return numext::erfc(a); } + /*************************************************************************** * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 585974809..62fec7008 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -49,6 +49,9 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h new file mode 100644 index 000000000..d481f2e06 --- /dev/null +++ b/Eigen/src/Core/SpecialFunctions.h @@ -0,0 +1,144 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2006-2010 Benoit Jacob +// Copyright (C) 2015 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIAL_FUNCTIONS_H +#define EIGEN_SPECIAL_FUNCTIONS_H + +namespace Eigen { + +namespace internal { + +template +EIGEN_STRONG_INLINE Scalar __lgamma(Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float __lgamma(float x) { return lgammaf(x); } +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double __lgamma(double x) { return lgamma(x); } + +template +EIGEN_STRONG_INLINE Scalar __erf(Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float __erf(float x) { return erff(x); } +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double __erf(double x) { return erf(x); } + +template +EIGEN_STRONG_INLINE Scalar __erfc(Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float __erfc(float x) { return erfcf(x); } +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double __erfc(double x) { return erfc(x); } + +} // end namespace internal + +/**************************************************************************** + * Implementations * + ****************************************************************************/ + +namespace internal { + +/**************************************************************************** + * Implementation of + * lgamma * + ****************************************************************************/ + +template +struct lgamma_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) + { + return __lgamma(x); + } +}; + +template +struct lgamma_retval +{ + typedef Scalar type; +}; + +/**************************************************************************** + * Implementation of + * erf * + ****************************************************************************/ + +template +struct erf_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) + { + return __erf(x); + } +}; + +template +struct erf_retval +{ + typedef Scalar type; +}; + +/**************************************************************************** +* Implementation of erfc * +****************************************************************************/ + +template +struct erfc_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) + { + return __erfc(x); + } +}; + +template +struct erfc_retval +{ + typedef Scalar type; +}; + +} // end namespace internal + +namespace numext { + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) lgamma(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) erfc(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); +} + +} // end namespace numext + +} // end namespace Eigen + +#endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h 
b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 3bea88bea..ecd5c444e 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -66,6 +66,43 @@ double2 prsqrt(const double2& a) return make_double2(rsqrt(a.x), rsqrt(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plgamma(const float4& a) +{ + return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plgamma(const double2& a) +{ + return make_double2(lgamma(a.x), lgamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perf(const float4& a) +{ + return make_float4(erf(a.x), erf(a.y), erf(a.z), erf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perf(const double2& a) +{ + return make_double2(erf(a.x), erf(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perfc(const float4& a) +{ + return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perfc(const double2& a) +{ + return make_double2(erfc(a.x), erfc(a.y)); +} + + #endif } // end namespace internal diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 0d2c2fef0..cb1b547e0 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -39,6 +39,9 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasLGamma = 1, + HasErf = 1, + HasErfc = 1, HasBlend = 0, }; @@ -59,6 +62,9 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasLGamma = 1, + HasErf = 1, + HasErfc = 1, HasBlend = 0, }; diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index e6c665fb6..e16bdd589 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -403,6 +403,77 @@ struct functor_traits > }; }; + +/** \internal + * \brief Template functor to compute the natural log of the absolute + * value of Gamma of a scalar + * \sa class CwiseUnaryOp, Cwise::lgamma() + */ +template struct scalar_lgamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::lgamma; return lgamma(a); + } + typedef typename packet_traits::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasLGamma + }; +}; + +/** \internal + * \brief Template functor to compute the Gauss error function of a + * scalar + * \sa class CwiseUnaryOp, Cwise::erf() + */ +template struct scalar_erf_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erf; return erf(a); + } + typedef typename packet_traits::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::perf(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasErf + }; +}; + +/** \internal + * \brief Template functor to compute the Complementary Error Function + * of a scalar + * \sa class CwiseUnaryOp, Cwise::erfc() + */ +template struct scalar_erfc_op { + 
EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erfc; return erfc(a); + } + typedef typename packet_traits::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::perfc(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasErfc + }; +}; + + /** \internal * \brief Template functor to compute the atan of a scalar * \sa class CwiseUnaryOp, ArrayBase::atan() @@ -422,6 +493,7 @@ struct functor_traits > }; }; + /** \internal * \brief Template functor to compute the tanh of a scalar * \sa class CwiseUnaryOp, ArrayBase::tanh() diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 483af876f..27c7907fc 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -294,6 +294,12 @@ struct stem_function }; } +// SpecialFunctions forward declarations +namespace internal { +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar __lgamma(Scalar x); +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar __erf(Scalar x); +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar __erfc(Scalar x); + } // end namespace Eigen #endif // EIGEN_FORWARDDECLARATIONS_H diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 108181419..1fe365aa7 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -96,7 +96,8 @@ STORAGE_LAYOUT_DOES_NOT_MATCH, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, - MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, + THIS_TYPE_IS_NOT_SUPPORTED }; }; diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 45e826b0c..ed9818dd1 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -21,6 +21,9 @@ typedef CwiseUnaryOp, const Derived> AtanReturn typedef CwiseUnaryOp, const Derived> TanhReturnType; typedef CwiseUnaryOp, const Derived> SinhReturnType; typedef CwiseUnaryOp, const Derived> CoshReturnType; +typedef CwiseUnaryOp, const Derived> LgammaReturnType; +typedef CwiseUnaryOp, const Derived> ErfReturnType; +typedef CwiseUnaryOp, const Derived> ErfcReturnType; typedef CwiseUnaryOp, const Derived> PowReturnType; typedef CwiseUnaryOp, const Derived> SquareReturnType; typedef CwiseUnaryOp, const Derived> CubeReturnType; @@ -302,6 +305,47 @@ cosh() const return CoshReturnType(derived()); } +/** \returns an expression of the coefficient-wise ln(|gamma(*this)|). + * + * Example: \include Cwise_lgamma.cpp + * Output: \verbinclude Cwise_lgamma.out + * + * \sa cos(), sin(), tan() + */ +inline const CwiseUnaryOp, Derived> +lgamma() const +{ + return LgammaReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise Gauss error + * function of *this. + * + * Example: \include Cwise_erf.cpp + * Output: \verbinclude Cwise_erf.out + * + * \sa cos(), sin(), tan() + */ +inline const CwiseUnaryOp, Derived> +erf() const +{ + return ErfReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise Complementary error + * function of *this. 
+ * + * Example: \include Cwise_erfc.cpp + * Output: \verbinclude Cwise_erfc.out + * + * \sa cos(), sin(), tan() + */ +inline const CwiseUnaryOp, Derived> +erfc() const +{ + return ErfcReturnType(derived()); +} + /** \returns an expression of the coefficient-wise power of *this to the given exponent. * * This function computes the coefficient-wise power. The function MatrixBase::pow() in the diff --git a/test/array.cpp b/test/array.cpp index 5395721f5..9994c23c3 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -217,6 +217,9 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.sinh(), sinh(m1)); VERIFY_IS_APPROX(m1.cosh(), cosh(m1)); VERIFY_IS_APPROX(m1.tanh(), tanh(m1)); + VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1)); + VERIFY_IS_APPROX(m1.erf(), erf(m1)); + VERIFY_IS_APPROX(m1.erfc(), erfc(m1)); VERIFY_IS_APPROX(m1.arg(), arg(m1)); VERIFY_IS_APPROX(m1.round(), round(m1)); VERIFY_IS_APPROX(m1.floor(), floor(m1)); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index b6616ac5e..304fab5de 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -351,6 +351,25 @@ template void packetmath_real() VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::denorm_min()), data2[1]); } + { + data1[0] = std::numeric_limits::quiet_NaN(); + packet_helper::HasLGamma,Packet> h; + h.store(data2, internal::plgamma(h.load(data1))); + VERIFY(std::isnan(data2[0])); + } + { + data1[0] = std::numeric_limits::quiet_NaN(); + packet_helper::HasErf,Packet> h; + h.store(data2, internal::perf(h.load(data1))); + VERIFY(std::isnan(data2[0])); + } + { + data1[0] = std::numeric_limits::quiet_NaN(); + packet_helper::HasErfc,Packet> h; + h.store(data2, internal::perfc(h.load(data1))); + VERIFY(std::isnan(data2[0])); + } + for (int i=0; i(0,1) * std::pow(Scalar(10), internal::random(-6,6)); @@ -360,6 +379,10 @@ template void packetmath_real() data1[internal::random(0, PacketSize)] = 0; CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); + CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); + CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); + CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); + if(PacketTraits::HasLog && PacketTraits::size>=2) { data1[0] = std::numeric_limits::quiet_NaN(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index d1ce3d0ed..392acf302 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -122,6 +122,24 @@ class TensorBase return unaryExpr(internal::scalar_tanh_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + lgamma() const { + return unaryExpr(internal::scalar_lgamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erf() const { + return unaryExpr(internal::scalar_erf_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erfc() const { + return unaryExpr(internal::scalar_erfc_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> sigmoid() const { diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp index 5ff082a3a..49e1894ab 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -507,6 +507,115 @@ static void 
test_cuda_convolution_3d() } } + +template +void test_cuda_lgamma(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.lgamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); + } + } +} + +template +void test_cuda_erf(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erf(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); + } + } +} + +template +void test_cuda_erfc(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erfc(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); + } + } +} + void test_cxx11_tensor_cuda() { CALL_SUBTEST(test_cuda_elementwise_small()); @@ -522,4 +631,34 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST(test_cuda_convolution_2d()); CALL_SUBTEST(test_cuda_convolution_3d()); CALL_SUBTEST(test_cuda_convolution_3d()); + CALL_SUBTEST(test_cuda_lgamma(1.0f)); + CALL_SUBTEST(test_cuda_lgamma(100.0f)); + CALL_SUBTEST(test_cuda_lgamma(0.01f)); + CALL_SUBTEST(test_cuda_lgamma(0.001f)); + CALL_SUBTEST(test_cuda_erf(1.0f)); + CALL_SUBTEST(test_cuda_erf(100.0f)); + CALL_SUBTEST(test_cuda_erf(0.01f)); + CALL_SUBTEST(test_cuda_erf(0.001f)); + CALL_SUBTEST(test_cuda_erfc(1.0f)); + // CALL_SUBTEST(test_cuda_erfc(100.0f)); + CALL_SUBTEST(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs + 
CALL_SUBTEST(test_cuda_erfc(0.01f)); + CALL_SUBTEST(test_cuda_erfc(0.001f)); + CALL_SUBTEST(test_cuda_tanh(1.0)); + CALL_SUBTEST(test_cuda_tanh(100.0)); + CALL_SUBTEST(test_cuda_tanh(0.01)); + CALL_SUBTEST(test_cuda_tanh(0.001)); + CALL_SUBTEST(test_cuda_lgamma(1.0)); + CALL_SUBTEST(test_cuda_lgamma(100.0)); + CALL_SUBTEST(test_cuda_lgamma(0.01)); + CALL_SUBTEST(test_cuda_lgamma(0.001)); + CALL_SUBTEST(test_cuda_erf(1.0)); + CALL_SUBTEST(test_cuda_erf(100.0)); + CALL_SUBTEST(test_cuda_erf(0.01)); + CALL_SUBTEST(test_cuda_erf(0.001)); + CALL_SUBTEST(test_cuda_erfc(1.0)); + // CALL_SUBTEST(test_cuda_erfc(100.0)); + CALL_SUBTEST(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST(test_cuda_erfc(0.01)); + CALL_SUBTEST(test_cuda_erfc(0.001)); } -- cgit v1.2.3 From 73b68d4370f761d6422e02e7e515aefdcd652c1e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Dec 2015 16:38:48 -0800 Subject: Fixed a couple of typos Cleaned up the code a bit. --- Eigen/src/Core/GenericPacketMath.h | 4 ++-- Eigen/src/Core/SpecialFunctions.h | 13 +++++-------- Eigen/src/Core/util/ForwardDeclarations.h | 6 ------ test/packetmath.cpp | 9 ++++++--- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 0e7dd29ed..6872f5e53 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -43,7 +43,7 @@ struct default_packet_traits { enum { HasHalfPacket = 0, - + HasAdd = 1, HasSub = 1, HasMul = 1, @@ -76,7 +76,7 @@ struct default_packet_traits HasTanh = 0, HasLGamma = 0, HasErf = 0, - HasErfc = 0 + HasErfc = 0, HasRound = 0, HasFloor = 0, diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index d481f2e06..ae8f0105a 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -1,7 +1,6 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2006-2010 Benoit Jacob // Copyright (C) 2015 Eugene Brevdo // // This Source Code Form is subject to the terms of the Mozilla @@ -45,14 +44,13 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double __erfc(double x } // end namespace internal /**************************************************************************** - * Implementations * + * Implementations * ****************************************************************************/ namespace internal { /**************************************************************************** - * Implementation of - * lgamma * + * Implementation of lgamma * ****************************************************************************/ template @@ -72,8 +70,7 @@ struct lgamma_retval }; /**************************************************************************** - * Implementation of - * erf * + * Implementation of erf * ****************************************************************************/ template @@ -92,8 +89,8 @@ struct erf_retval typedef Scalar type; }; -/**************************************************************************** -* Implementation of erfc * +/*************************************************************************** +* Implementation of erfc * ****************************************************************************/ template diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 27c7907fc..483af876f 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -294,12 +294,6 @@ struct stem_function }; } -// SpecialFunctions forward declarations -namespace internal { -template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar __lgamma(Scalar x); -template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar __erf(Scalar x); -template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar __erfc(Scalar x); - } // end namespace Eigen #endif // EIGEN_FORWARDDECLARATIONS_H diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 304fab5de..c34b6f3f1 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -355,19 +355,19 @@ template void packetmath_real() data1[0] = std::numeric_limits::quiet_NaN(); packet_helper::HasLGamma,Packet> h; h.store(data2, internal::plgamma(h.load(data1))); - VERIFY(std::isnan(data2[0])); + VERIFY((numext::isnan)(data2[0])); } { data1[0] = std::numeric_limits::quiet_NaN(); packet_helper::HasErf,Packet> h; h.store(data2, internal::perf(h.load(data1))); - VERIFY(std::isnan(data2[0])); + VERIFY((numext::isnan)(data2[0])); } { data1[0] = std::numeric_limits::quiet_NaN(); packet_helper::HasErfc,Packet> h; h.store(data2, internal::perfc(h.load(data1))); - VERIFY(std::isnan(data2[0])); + VERIFY((numext::isnan)(data2[0])); } for (int i=0; i void packetmath_real() data1[i] = internal::random(0,1) * std::pow(Scalar(10), internal::random(-6,6)); data2[i] = internal::random(0,1) * std::pow(Scalar(10), internal::random(-6,6)); } + +#if __cplusplus > 199711L if(internal::random(0,1)<0.1) data1[internal::random(0, PacketSize)] = 0; CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); @@ -382,6 +384,7 @@ template void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); +#endif if(PacketTraits::HasLog && PacketTraits::size>=2) { -- cgit v1.2.3 From b1ae39794cee2536d28360acd2ea6291806debe1 
Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Dec 2015 16:46:35 -0800 Subject: Simplified the code a bit --- Eigen/src/plugins/ArrayCwiseUnaryOps.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index ed9818dd1..01432e2f3 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -312,7 +312,7 @@ cosh() const * * \sa cos(), sin(), tan() */ -inline const CwiseUnaryOp, Derived> +inline const LgammaReturnType lgamma() const { return LgammaReturnType(derived()); @@ -326,7 +326,7 @@ lgamma() const * * \sa cos(), sin(), tan() */ -inline const CwiseUnaryOp, Derived> +inline const ErfReturnType erf() const { return ErfReturnType(derived()); @@ -340,7 +340,7 @@ erf() const * * \sa cos(), sin(), tan() */ -inline const CwiseUnaryOp, Derived> +inline const ErfcReturnType erfc() const { return ErfcReturnType(derived()); -- cgit v1.2.3 From b630d10b62d4338181a49272d5dd57381964d3a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Dec 2015 17:08:08 -0800 Subject: Only disable the erf, erfc, and lgamma tests for older versions of c++. --- test/packetmath.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index c34b6f3f1..758c2fb9d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -376,11 +376,11 @@ template void packetmath_real() data2[i] = internal::random(0,1) * std::pow(Scalar(10), internal::random(-6,6)); } -#if __cplusplus > 199711L if(internal::random(0,1)<0.1) data1[internal::random(0, PacketSize)] = 0; CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); +#if __cplusplus > 199711L CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); -- cgit v1.2.3 From e535450573bf8a15d63cc0dff6090a89f28cf8cb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Dec 2015 14:06:39 -0800 Subject: Cleanup --- Eigen/src/Core/GenericPacketMath.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6872f5e53..8ad51bad5 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -437,15 +437,15 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } /** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plgamma(const Packet& a) { return numext::lgamma(a); } +Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); } /** \internal \returns the erf(\a a) (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet perf(const Packet& a) { return numext::erf(a); } +Packet perf(const Packet& a) { using numext::erf; return erf(a); } /** \internal \returns the erfc(\a a) (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet perfc(const Packet& a) { return numext::erfc(a); } +Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } /*************************************************************************** * The following functions might not have to be overwritten for vectorized types -- cgit v1.2.3 From 
53b196aa5fb503ab3707887eea226eec56943380 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Dec 2015 14:17:34 -0800 Subject: Simplified the implementation of lgamma, erf, and erfc --- Eigen/src/Core/SpecialFunctions.h | 87 +++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index ae8f0105a..f90f1b81b 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -11,42 +11,6 @@ #define EIGEN_SPECIAL_FUNCTIONS_H namespace Eigen { - -namespace internal { - -template -EIGEN_STRONG_INLINE Scalar __lgamma(Scalar x) { - EIGEN_STATIC_ASSERT((internal::is_same::value == false), - THIS_TYPE_IS_NOT_SUPPORTED); -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float __lgamma(float x) { return lgammaf(x); } -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double __lgamma(double x) { return lgamma(x); } - -template -EIGEN_STRONG_INLINE Scalar __erf(Scalar x) { - EIGEN_STATIC_ASSERT((internal::is_same::value == false), - THIS_TYPE_IS_NOT_SUPPORTED); -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float __erf(float x) { return erff(x); } -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double __erf(double x) { return erf(x); } - -template -EIGEN_STRONG_INLINE Scalar __erfc(Scalar x) { - EIGEN_STATIC_ASSERT((internal::is_same::value == false), - THIS_TYPE_IS_NOT_SUPPORTED); -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float __erfc(float x) { return erfcf(x); } -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double __erfc(double x) { return erfc(x); } - -} // end namespace internal - -/**************************************************************************** - * Implementations * - ****************************************************************************/ - namespace internal { /**************************************************************************** @@ -59,10 +23,25 @@ struct lgamma_impl EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) { - return __lgamma(x); + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); } }; +template<> +struct lgamma_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const float& x) { return ::lgammaf(x); } +}; + +template<> +struct lgamma_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double& x) { return ::lgamma(x); } +}; + template struct lgamma_retval { @@ -79,10 +58,25 @@ struct erf_impl EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) { - return __erf(x); + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); } }; +template<> +struct erf_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float& x) { return ::erff(x); } +}; + +template<> +struct erf_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double& x) { return ::erf(x); } +}; + template struct erf_retval { @@ -99,10 +93,25 @@ struct erfc_impl EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) { - return __erfc(x); + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); } }; +template<> +struct erfc_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); } +}; + +template<> +struct erfc_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } +}; + template struct 
erfc_retval { -- cgit v1.2.3 From 48877a69334382b8478f5095c5e56500b7de7478 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 13:09:49 -0800 Subject: Only implement the lgamma, erf, and erfc functions when using a compiler compliant with the C99 specification. --- Eigen/src/Core/SpecialFunctions.h | 43 +++++++++++++++++++++++---------------- test/packetmath.cpp | 4 +++- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index f90f1b81b..1de3d7f78 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -28,6 +28,13 @@ struct lgamma_impl } }; +template +struct lgamma_retval +{ + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH template<> struct lgamma_impl { @@ -41,12 +48,7 @@ struct lgamma_impl EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double& x) { return ::lgamma(x); } }; - -template -struct lgamma_retval -{ - typedef Scalar type; -}; +#endif /**************************************************************************** * Implementation of erf * @@ -63,6 +65,13 @@ struct erf_impl } }; +template +struct erf_retval +{ + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH template<> struct erf_impl { @@ -76,12 +85,7 @@ struct erf_impl EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double& x) { return ::erf(x); } }; - -template -struct erf_retval -{ - typedef Scalar type; -}; +#endif // EIGEN_HAS_C99_MATH /*************************************************************************** * Implementation of erfc * @@ -98,6 +102,13 @@ struct erfc_impl } }; +template +struct erfc_retval +{ + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH template<> struct erfc_impl { @@ -111,15 +122,11 @@ struct erfc_impl EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } }; - -template -struct erfc_retval -{ - typedef Scalar type; -}; +#endif // EIGEN_HAS_C99_MATH } // end namespace internal + namespace numext { template diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 758c2fb9d..91bb998d0 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -351,6 +351,7 @@ template void packetmath_real() VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::denorm_min()), data2[1]); } +#ifdef EIGEN_HAS_C99_MATH { data1[0] = std::numeric_limits::quiet_NaN(); packet_helper::HasLGamma,Packet> h; @@ -369,6 +370,7 @@ template void packetmath_real() h.store(data2, internal::perfc(h.load(data1))); VERIFY((numext::isnan)(data2[0])); } +#endif // EIGEN_HAS_C99_MATH for (int i=0; i void packetmath_real() data1[internal::random(0, PacketSize)] = 0; CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); -#if __cplusplus > 199711L +#if defined(EIGEN_HAS_C99_MATH) && (__cplusplus > 199711L) CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); -- cgit v1.2.3 From 58e06447dec67e265fb0a749e60f67ecd831b32b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 13:11:36 -0800 Subject: Silence a compilation warning --- Eigen/src/Core/SpecialFunctions.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 1de3d7f78..05973e372 100644 --- 
a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -25,6 +25,7 @@ struct lgamma_impl { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); } }; @@ -62,6 +63,7 @@ struct erf_impl { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); } }; @@ -99,6 +101,7 @@ struct erfc_impl { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); } }; -- cgit v1.2.3 From 8314962ce2ce5e7cd8c591b7a0a7039abd83f5c6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 13:13:45 -0800 Subject: Only test the lgamma, erf and erfc function when using a C99 compliant compiler --- test/array.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/array.cpp b/test/array.cpp index 9994c23c3..6adedfb06 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -202,7 +202,7 @@ template void array_real(const ArrayType& m) m2 = ArrayType::Random(rows, cols), m3(rows, cols), m4 = m1; - + m4 = (m4.abs()==Scalar(0)).select(1,m4); Scalar s1 = internal::random(); @@ -217,9 +217,11 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.sinh(), sinh(m1)); VERIFY_IS_APPROX(m1.cosh(), cosh(m1)); VERIFY_IS_APPROX(m1.tanh(), tanh(m1)); +#ifdef EIGEN_HAS_C99_MATH VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1)); VERIFY_IS_APPROX(m1.erf(), erf(m1)); VERIFY_IS_APPROX(m1.erfc(), erfc(m1)); +#endif // EIGEN_HAS_C99_MATH VERIFY_IS_APPROX(m1.arg(), arg(m1)); VERIFY_IS_APPROX(m1.round(), round(m1)); VERIFY_IS_APPROX(m1.floor(), floor(m1)); -- cgit v1.2.3 From d1862967a89501f0382834e0d128a53ad5764377 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 10 Dec 2015 22:23:21 +0100 Subject: Make sure ADOLC is recent enough by searching for adtl.h --- cmake/FindAdolc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/FindAdolc.cmake b/cmake/FindAdolc.cmake index 1a7ff3628..937e54990 100644 --- a/cmake/FindAdolc.cmake +++ b/cmake/FindAdolc.cmake @@ -5,7 +5,7 @@ endif (ADOLC_INCLUDES AND ADOLC_LIBRARIES) find_path(ADOLC_INCLUDES NAMES - adolc/adouble.h + adolc/adtl.h PATHS $ENV{ADOLCDIR} ${INCLUDE_INSTALL_DIR} -- cgit v1.2.3 From df6f54ff63fbf8ec4bd6218d9887351b30dda30f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 10 Dec 2015 22:24:58 +0100 Subject: Fix storage order of PartialRedux --- Eigen/src/Core/CoreEvaluators.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 42ad452f7..f97dc33de 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -994,7 +994,7 @@ struct evaluator > CoeffReadCost = TraversalSize==Dynamic ? HugeCost : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), - Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&HereditaryBits), + Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))), Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized }; -- cgit v1.2.3 From b820b097b870f96538f87862bb3cf22d2b3f4b3b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 13:52:05 -0800 Subject: Created EIGEN_HAS_C99_MATH define as Gael suggested. 
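The new define is consumed through plain #ifdef guards, as in the SpecialFunctions.h and test hunks above. A hedged sketch of how client code would follow the same convention — only the macro name and Eigen::numext::lgamma come from these patches; the pre-C99 fallback branch is purely illustrative, since without C99 math the float/double specializations simply do not exist and a caller must supply its own approximation:

#include <Eigen/Core>
#include <cmath>

template <typename Scalar>
Scalar log_abs_gamma(Scalar x)
{
#ifdef EIGEN_HAS_C99_MATH
  // C99 math available: Eigen's dispatcher forwards to ::lgammaf / ::lgamma.
  return Eigen::numext::lgamma(x);
#else
  // Pre-C99 toolchain: numext::lgamma has no float/double implementation,
  // so fall back to a crude Stirling-style placeholder (illustration only,
  // not production quality).
  return x * std::log(x) - x;
#endif
}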
--- Eigen/src/Core/util/Macros.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 8def69610..d375c77dd 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -341,6 +341,13 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support C99? +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ + || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ + || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define EIGEN_HAS_C99_MATH 1 +#endif + // Does the compiler support result_of? #if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)) #define EIGEN_HAS_STD_RESULT_OF 1 -- cgit v1.2.3 From 6acf2bd4725a3394c40e1b542ae03a9c6fbb9a2c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 17:17:42 -0800 Subject: Fixed compilation error triggered by MSVC 2008 --- test/packetmath.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 91bb998d0..bf2e3fecc 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -336,7 +336,7 @@ template void packetmath_real() data1[1] = 0; h.store(data2, internal::pexp(h.load(data1))); VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::epsilon()), data2[0]); - VERIFY_IS_EQUAL(std::exp(0), data2[1]); + VERIFY_IS_EQUAL(std::exp(Scalar(0)), data2[1]); data1[0] = (std::numeric_limits::min)(); data1[1] = -(std::numeric_limits::min)(); @@ -401,7 +401,7 @@ template void packetmath_real() data1[1] = 0; h.store(data2, internal::plog(h.load(data1))); VERIFY((numext::isnan)(data2[0])); - VERIFY_IS_EQUAL(std::log(0), data2[1]); + VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]); data1[0] = (std::numeric_limits::min)(); data1[1] = -(std::numeric_limits::min)(); -- cgit v1.2.3 From 4e324ca6ae1ae7b60e18227bbfdde9a0380e90e7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 20:47:25 -0800 Subject: Updated the cxx11_tensor_assign test to make it compile without support for cxx11 --- unsupported/test/cxx11_tensor_assign.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index d16aaf847..e5cf61fe1 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -29,8 +29,8 @@ static void test_1d() int row_major[6]; memset(col_major, 0, 6*sizeof(int)); memset(row_major, 0, 6*sizeof(int)); - TensorMap> vec3(col_major, 6); - TensorMap> vec4(row_major, 6); + TensorMap > vec3(col_major, 6); + TensorMap > vec4(row_major, 6); vec3 = vec1; vec4 = vec2; @@ -92,8 +92,8 @@ static void test_2d() int row_major[6]; memset(col_major, 0, 6*sizeof(int)); memset(row_major, 0, 6*sizeof(int)); - TensorMap> mat3(row_major, 2, 3); - TensorMap> mat4(col_major, 2, 3); + TensorMap > mat3(row_major, 2, 3); + TensorMap > mat4(col_major, 2, 3); mat3 = mat1; mat4 = mat2; @@ -152,8 +152,8 @@ static void test_3d() int row_major[2*3*7]; memset(col_major, 0, 2*3*7*sizeof(int)); memset(row_major, 0, 2*3*7*sizeof(int)); - TensorMap> mat3(col_major, 2, 3, 7); - TensorMap> mat4(row_major, 2, 3, 7); + TensorMap > mat3(col_major, 2, 3, 7); + TensorMap > mat4(row_major, 2, 3, 7); mat3 = mat1; mat4 = mat2; -- cgit v1.2.3 From 9db8316c936b2d83e2b6484b681b275f9cccae95 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 20:53:44 -0800 Subject: Updated the cxx11_tensor_custom_op to not require cxx11. 
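The recurring change in this commit replaces C++11 brace-initialization of Eigen::array with element-wise assignment. The pattern in isolation (illustrative; Eigen::array is the fixed-size array type used by the Tensor module, which aliases std::array when C++11 is available):

#include <unsupported/Eigen/CXX11/Tensor>

void make_strides()
{
#if __cplusplus >= 201103L
  Eigen::array<Eigen::DenseIndex, 2> strides{{2, 2}};  // aggregate init, C++11 only
#else
  Eigen::array<Eigen::DenseIndex, 2> strides;          // C++03-compatible form
  strides[0] = 2;
  strides[1] = 2;
#endif
  (void)strides;  // silence unused-variable warnings in this sketch
}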
--- unsupported/test/cxx11_tensor_custom_op.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp index 7e33c9580..8baa477cc 100644 --- a/unsupported/test/cxx11_tensor_custom_op.cpp +++ b/unsupported/test/cxx11_tensor_custom_op.cpp @@ -25,7 +25,9 @@ struct InsertZeros { template void eval(const Tensor& input, Output& output, const Device& device) const { - array strides{{2, 2}}; + array strides; + strides[0] = 2; + strides[1] = 2; output.stride(strides).device(device) = input; Eigen::DSizes offsets(1,1); @@ -70,7 +72,8 @@ struct BatchMatMul { Output& output, const Device& device) const { typedef Tensor::DimensionPair DimPair; - array dims({{DimPair(1, 0)}}); + array dims; + dims[0] = DimPair(1, 0); for (int i = 0; i < output.dimension(2); ++i) { output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims); } @@ -88,9 +91,10 @@ static void test_custom_binary_op() Tensor result = tensor1.customOp(tensor2, BatchMatMul()); for (int i = 0; i < 5; ++i) { typedef Tensor::DimensionPair DimPair; - array dims({{DimPair(1, 0)}}); + array dims; + dims[0] = DimPair(1, 0); Tensor reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims); - TensorRef> val = result.chip<2>(i); + TensorRef > val = result.chip<2>(i); for (int j = 0; j < 2; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(val(j, k), reference(j, k)); -- cgit v1.2.3 From 8e00ea9a92cfbe849056bc74a1aab34ff8e8a811 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 22:45:10 -0800 Subject: Fixed the coefficient accessors use for the 2d and 3d case when compiling without cxx11 support. --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 4347bc2ff..5c759af09 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -49,7 +49,7 @@ template class TensorMap : public Tensor IsAligned = ((int(Options_)&Aligned)==Aligned), PacketAccess = (internal::packet_traits::size > 1), Layout = PlainObjectType::Layout, - CoordAccess = true, + CoordAccess = true }; EIGEN_DEVICE_FUNC @@ -158,7 +158,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const { if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[0]; + const Index index = i1 + i0 * m_dimensions[1]; return m_data[index]; } else { const Index index = i0 + i1 * m_dimensions[0]; @@ -169,7 +169,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const { if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); return m_data[index]; } else { const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); @@ -245,7 +245,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) { if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[0]; + const Index index = i1 + i0 * m_dimensions[1]; return m_data[index]; } else { const Index index = i0 + i1 * m_dimensions[0]; @@ -256,7 +256,7 @@ template class TensorMap : public Tensor 
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) { if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); return m_data[index]; } else { const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); -- cgit v1.2.3 From 8d28a161b2f3a8866a7558303514861d2a3b6c69 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 22:53:56 -0800 Subject: Use the proper accessor to refer to the value of a scalar tensor --- unsupported/test/cxx11_tensor_sugar.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index 98671a986..adac472cf 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -18,7 +18,7 @@ static void test_comparison_sugar() { #define TEST_TENSOR_EQUAL(e1, e2) \ b = ((e1) == (e2)).all(); \ - VERIFY(b(0)) + VERIFY(b()) #define TEST_OP(op) TEST_TENSOR_EQUAL(t op 0, t op t.constant(0)) -- cgit v1.2.3 From 2d8f2e4042ed8c347d90fb2dacc53a480f7a28b4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 23:20:04 -0800 Subject: Made 2 tests compile without cxx11. HdG: -- --- unsupported/test/cxx11_tensor_casts.cpp | 4 ++-- unsupported/test/cxx11_tensor_reverse.cpp | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp index 729e43327..3c6d0d2ff 100644 --- a/unsupported/test/cxx11_tensor_casts.cpp +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -24,12 +24,12 @@ static void test_simple_cast() cplextensor.setRandom(); chartensor = ftensor.cast(); - cplextensor = ftensor.cast>(); + cplextensor = ftensor.cast >(); for (int i = 0; i < 20; ++i) { for (int j = 0; j < 30; ++j) { VERIFY_IS_EQUAL(chartensor(i,j), static_cast(ftensor(i,j))); - VERIFY_IS_EQUAL(cplextensor(i,j), static_cast>(ftensor(i,j))); + VERIFY_IS_EQUAL(cplextensor(i,j), static_cast >(ftensor(i,j))); } } } diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp index f96c21fa3..b35b8d29e 100644 --- a/unsupported/test/cxx11_tensor_reverse.cpp +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -114,10 +114,18 @@ static void test_expr_reverse(bool LValue) Tensor result(2,3,5,7); - array src_slice_dim{{2,3,1,7}}; - array src_slice_start{{0,0,0,0}}; - array dst_slice_dim{{2,3,1,7}}; - array dst_slice_start{{0,0,0,0}}; + array src_slice_dim; + src_slice_dim[0] = 2; + src_slice_dim[1] = 3; + src_slice_dim[2] = 1; + src_slice_dim[3] = 7; + array src_slice_start; + src_slice_start[0] = 0; + src_slice_start[1] = 0; + src_slice_start[2] = 0; + src_slice_start[3] = 0; + array dst_slice_dim = src_slice_dim; + array dst_slice_start = src_slice_start; for (int i = 0; i < 5; ++i) { if (LValue) { -- cgit v1.2.3 From 6af52a1227f204e72d9f8473deb3bb648a665149 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Dec 2015 23:31:12 -0800 Subject: Fixed a typo in the constructor of tensors of rank 5. 
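A quick sanity check for the repaired constructor, as illustrative test code rather than part of the patch: after the fix, the element count handed to the storage reflects all five extents, so size() must equal the product of the five dimensions.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

void check_rank5_constructor()
{
  Eigen::Tensor<float, 5> t(2, 3, 5, 7, 11);
  // The allocated storage must cover every coefficient of the rank-5 tensor.
  assert(t.size() == 2 * 3 * 5 * 7 * 11);  // 2310 coefficients
  t.setZero();
}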
--- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 6d357545c..87ac8f5aa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -78,7 +78,7 @@ class Tensor : public TensorBase0) & !(Options_&DontAlign), PacketAccess = (internal::packet_traits::size > 1), Layout = Options_ & RowMajor ? RowMajor : ColMajor, - CoordAccess = true, + CoordAccess = true }; static const int Options = Options_; @@ -368,7 +368,7 @@ class Tensor : public TensorBase(dim1, dim2, dim3, dim4, dim5)) + : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) { EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } -- cgit v1.2.3 From 836da91b3fa6c4b2a2413268effd7e481ec8b066 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 10:06:28 +0100 Subject: Fix unit tests wrt EIGEN_DEFAULT_TO_ROW_MAJOR --- test/is_same_dense.cpp | 11 ++++++----- test/nesting_ops.cpp | 7 ++++--- test/vectorization_logic.cpp | 5 ++++- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp index 318ba8717..6d7904bac 100644 --- a/test/is_same_dense.cpp +++ b/test/is_same_dense.cpp @@ -11,9 +11,10 @@ void test_is_same_dense() { - MatrixXd m1(10,10); - Ref ref_m1(m1); - Ref const_ref_m1(m1); + typedef Matrix ColMatrixXd; + ColMatrixXd m1(10,10); + Ref ref_m1(m1); + Ref const_ref_m1(m1); VERIFY(is_same_dense(m1,m1)); VERIFY(is_same_dense(m1,ref_m1)); VERIFY(is_same_dense(const_ref_m1,m1)); @@ -22,9 +23,9 @@ void test_is_same_dense() VERIFY(is_same_dense(m1.block(0,0,m1.rows(),m1.cols()),m1)); VERIFY(!is_same_dense(m1.row(0),m1.col(0))); - Ref const_ref_m1_row(m1.row(1)); + Ref const_ref_m1_row(m1.row(1)); VERIFY(!is_same_dense(m1.row(1),const_ref_m1_row)); - Ref const_ref_m1_col(m1.col(1)); + Ref const_ref_m1_col(m1.col(1)); VERIFY(is_same_dense(m1.col(1),const_ref_m1_col)); } diff --git a/test/nesting_ops.cpp b/test/nesting_ops.cpp index 76a63400c..2f5025305 100644 --- a/test/nesting_ops.cpp +++ b/test/nesting_ops.cpp @@ -51,6 +51,7 @@ template void run_nesting_ops_2(const MatrixType& _m) Index rows = _m.rows(); Index cols = _m.cols(); MatrixType m1 = MatrixType::Random(rows,cols); + Matrix m2; if((MatrixType::SizeAtCompileTime==Dynamic)) { @@ -79,9 +80,9 @@ template void run_nesting_ops_2(const MatrixType& _m) } VERIFY( verify_eval_type<2>(m1+m1, m1+m1) ); VERIFY( verify_eval_type<3>(m1+m1, m1) ); - VERIFY( verify_eval_type<1>(m1*m1.transpose(), m1) ); - VERIFY( verify_eval_type<1>(m1*(m1+m1).transpose(), m1) ); - VERIFY( verify_eval_type<2>(m1*m1.transpose(), m1) ); + VERIFY( verify_eval_type<1>(m1*m1.transpose(), m2) ); + VERIFY( verify_eval_type<1>(m1*(m1+m1).transpose(), m2) ); + VERIFY( verify_eval_type<2>(m1*m1.transpose(), m2) ); VERIFY( verify_eval_type<1>(m1+m1*m1, m1) ); VERIFY( verify_eval_type<1>(m1.template triangularView().solve(m1), m1) ); diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index da60a2f3a..35fbb9781 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -1,12 +1,15 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud +// Copyright (C) 2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#undef EIGEN_DEFAULT_TO_ROW_MAJOR +#endif #define EIGEN_DEBUG_ASSIGN #include "main.h" #include -- cgit v1.2.3 From 79c1e6d0a63883cec691eaebcdbf0935ad557f70 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 10:55:07 +0100 Subject: Fix compilation of MKL support. --- Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h | 13 ++++++------- Eigen/src/Core/util/DisableStupidWarnings.h | 4 +++- Eigen/src/Eigenvalues/ComplexSchur_MKL.h | 8 ++++---- Eigen/src/Eigenvalues/RealSchur_MKL.h | 6 +++--- Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h | 8 ++++---- Eigen/src/QR/ColPivHouseholderQR_MKL.h | 4 ++-- 6 files changed, 22 insertions(+), 21 deletions(-) mode change 100644 => 100755 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h mode change 100644 => 100755 Eigen/src/Core/util/DisableStupidWarnings.h mode change 100644 => 100755 Eigen/src/Eigenvalues/ComplexSchur_MKL.h mode change 100644 => 100755 Eigen/src/Eigenvalues/RealSchur_MKL.h mode change 100644 => 100755 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h mode change 100644 => 100755 Eigen/src/QR/ColPivHouseholderQR_MKL.h diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h old mode 100644 new mode 100755 index 86684b66d..a08f385bc --- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h @@ -52,16 +52,16 @@ template { \ static void run( \ Index size, const Scalar* lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \ + const Scalar* _rhs, Scalar* res, Scalar alpha) { \ enum {\ IsColMajor = StorageOrder==ColMajor \ }; \ if (IsColMajor == ConjugateLhs) {\ selfadjoint_matrix_vector_product::run( \ - size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \ + size, lhs, lhsStride, _rhs, res, alpha); \ } else {\ selfadjoint_matrix_vector_product_symv::run( \ - size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \ + size, lhs, lhsStride, _rhs, res, alpha); \ }\ } \ }; \ @@ -79,13 +79,13 @@ typedef Matrix SYMVVector;\ \ static void run( \ Index size, const EIGTYPE* lhs, Index lhsStride, \ -const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \ +const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ { \ enum {\ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ IsLower = UpLo == Lower ? 1 : 0 \ }; \ - MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \ + MKL_INT n=size, lda=lhsStride, incx=1, incy=1; \ MKLTYPE alpha_, beta_; \ const EIGTYPE *x_ptr, myone(1); \ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \ @@ -93,10 +93,9 @@ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \ assign_scalar_eig2mkl(beta_, myone); \ SYMVVector x_tmp; \ if (ConjugateRhs) { \ - Map > map_x(_rhs,size,1,InnerStride<>(incx)); \ + Map map_x(_rhs,size,1); \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ - incx=1; \ } else x_ptr=_rhs; \ MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ }\ diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h old mode 100644 new mode 100755 index 46c141ad5..747232938 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -25,10 +25,12 @@ // typedef that may be a reference type. 
// 279 - controlling expression is constant // ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case. + // 1684 - conversion from pointer to same-sized integral type (potential portability problem) + // 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning push #endif - #pragma warning disable 2196 279 + #pragma warning disable 2196 279 1684 2259 #elif defined __clang__ // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant // this is really a stupid warning as it warns on compile-time expressions involving enums diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h old mode 100644 new mode 100755 index 27aed923c..931573a4e --- a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h +++ b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h @@ -40,9 +40,9 @@ namespace Eigen { /** \internal Specialization for the data types supported by MKL */ #define EIGEN_MKL_SCHUR_COMPLEX(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \ -template<> inline \ +template<> template inline \ ComplexSchur >& \ -ComplexSchur >::compute(const Matrix& matrix, bool computeU) \ +ComplexSchur >::compute(const EigenBase& matrix, bool computeU) \ { \ typedef Matrix MatrixType; \ typedef MatrixType::RealScalar RealScalar; \ @@ -53,7 +53,7 @@ ComplexSchur >::compute(const Matri m_matUisUptodate = false; \ if(matrix.cols() == 1) \ { \ - m_matT = matrix.cast(); \ + m_matT = matrix.template cast(); \ if(computeU) m_matU = ComplexMatrixType::Identity(1,1); \ m_info = Success; \ m_isInitialized = true; \ @@ -61,7 +61,6 @@ ComplexSchur >::compute(const Matri return *this; \ } \ lapack_int n = matrix.cols(), sdim, info; \ - lapack_int lda = matrix.outerStride(); \ lapack_int matrix_order = MKLCOLROW; \ char jobvs, sort='N'; \ LAPACK_##MKLPREFIX_U##_SELECT1 select = 0; \ @@ -69,6 +68,7 @@ ComplexSchur >::compute(const Matri m_matU.resize(n, n); \ lapack_int ldvs = m_matU.outerStride(); \ m_matT = matrix; \ + lapack_int lda = m_matT.outerStride(); \ Matrix w; \ w.resize(n, 1);\ info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)w.data(), (MKLTYPE*)m_matU.data(), ldvs ); \ diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h old mode 100644 new mode 100755 index c3089b468..e80926400 --- a/Eigen/src/Eigenvalues/RealSchur_MKL.h +++ b/Eigen/src/Eigenvalues/RealSchur_MKL.h @@ -40,14 +40,13 @@ namespace Eigen { /** \internal Specialization for the data types supported by MKL */ #define EIGEN_MKL_SCHUR_REAL(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \ -template<> inline \ +template<> template inline \ RealSchur >& \ -RealSchur >::compute(const Matrix& matrix, bool computeU) \ +RealSchur >::compute(const EigenBase& matrix, bool computeU) \ { \ eigen_assert(matrix.cols() == matrix.rows()); \ \ lapack_int n = matrix.cols(), sdim, info; \ - lapack_int lda = matrix.outerStride(); \ lapack_int matrix_order = MKLCOLROW; \ char jobvs, sort='N'; \ LAPACK_##MKLPREFIX_U##_SELECT2 select = 0; \ @@ -55,6 +54,7 @@ RealSchur >::compute(const Matrix wr, wi; \ wr.resize(n, 1); wi.resize(n, 1); \ info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)wr.data(), 
(MKLTYPE*)wi.data(), (MKLTYPE*)m_matU.data(), ldvs ); \ diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h old mode 100644 new mode 100755 index 17c0dadd2..3499dc78a --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h @@ -40,9 +40,9 @@ namespace Eigen { /** \internal Specialization for the data types supported by MKL */ #define EIGEN_MKL_EIG_SELFADJ(EIGTYPE, MKLTYPE, MKLRTYPE, MKLNAME, EIGCOLROW, MKLCOLROW ) \ -template<> inline \ +template<> template inline \ SelfAdjointEigenSolver >& \ -SelfAdjointEigenSolver >::compute(const Matrix& matrix, int options) \ +SelfAdjointEigenSolver >::compute(const EigenBase& matrix, int options) \ { \ eigen_assert(matrix.cols() == matrix.rows()); \ eigen_assert((options&~(EigVecMask|GenEigMask))==0 \ @@ -56,7 +56,7 @@ SelfAdjointEigenSolver >::compute(c \ if(n==1) \ { \ - m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0)); \ + m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); \ if(computeEigenvectors) m_eivec.setOnes(n,n); \ m_info = Success; \ m_isInitialized = true; \ @@ -64,7 +64,7 @@ SelfAdjointEigenSolver >::compute(c return *this; \ } \ \ - lda = matrix.outerStride(); \ + lda = m_eivec.outerStride(); \ matrix_order=MKLCOLROW; \ char jobz, uplo='L'/*, range='A'*/; \ jobz = computeEigenvectors ? 'V' : 'N'; \ diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h old mode 100644 new mode 100755 index 7b6ba0a5e..fce4df08c --- a/Eigen/src/QR/ColPivHouseholderQR_MKL.h +++ b/Eigen/src/QR/ColPivHouseholderQR_MKL.h @@ -41,10 +41,10 @@ namespace Eigen { /** \internal Specialization for the data types supported by MKL */ #define EIGEN_MKL_QR_COLPIV(EIGTYPE, MKLTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \ -template<> inline \ +template<> template inline \ ColPivHouseholderQR >& \ ColPivHouseholderQR >::compute( \ - const Matrix& matrix) \ + const EigenBase& matrix) \ \ { \ using std::abs; \ -- cgit v1.2.3 From 30b5c4cd14bcb9998916e6d782bc3b06465ec510 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 10:59:39 +0100 Subject: Remove useless "explicit", and fix inline/static order. --- Eigen/src/Core/AssignEvaluator.h | 4 ++-- Eigen/src/Core/VectorwiseOp.h | 2 +- Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 2 +- Eigen/src/Eigenvalues/RealQZ.h | 2 +- Eigen/src/SVD/JacobiSVD.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) mode change 100644 => 100755 Eigen/src/Core/AssignEvaluator.h mode change 100644 => 100755 Eigen/src/Core/VectorwiseOp.h mode change 100644 => 100755 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h mode change 100644 => 100755 Eigen/src/Eigenvalues/RealQZ.h mode change 100644 => 100755 Eigen/src/SVD/JacobiSVD.h diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h old mode 100644 new mode 100755 index db3bef38d..9dfffbcc4 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -606,7 +606,7 @@ public: assignPacket(row, col); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index rowIndexByOuterInner(Index outer, Index inner) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; return int(Traits::RowsAtCompileTime) == 1 ? 
0 @@ -615,7 +615,7 @@ public: : inner; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index colIndexByOuterInner(Index outer, Index inner) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; return int(Traits::ColsAtCompileTime) == 1 ? 0 diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h old mode 100644 new mode 100755 index dbc272dae..483f71909 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -115,7 +115,7 @@ struct member_lpnorm { typedef ResultType result_type; template struct Cost { enum { value = (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost }; }; - EIGEN_DEVICE_FUNC explicit member_lpnorm() {} + EIGEN_DEVICE_FUNC member_lpnorm() {} template EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const { return mat.template lpNorm
<p>
(); } diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h old mode 100644 new mode 100755 index e2e28cd4a..a9d6790d5 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -145,7 +145,7 @@ template class GeneralizedEigenSolver * * \sa compute() */ - explicit GeneralizedEigenSolver(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true) + GeneralizedEigenSolver(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true) : m_eivec(A.rows(), A.cols()), m_alphas(A.cols()), m_betas(A.cols()), diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h old mode 100644 new mode 100755 index 02ebb7d17..a62071d42 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -101,7 +101,7 @@ namespace Eigen { * * This constructor calls compute() to compute the QZ decomposition. */ - explicit RealQZ(const MatrixType& A, const MatrixType& B, bool computeQZ = true) : + RealQZ(const MatrixType& A, const MatrixType& B, bool computeQZ = true) : m_S(A.rows(),A.cols()), m_T(A.rows(),A.cols()), m_Q(A.rows(),A.cols()), diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h old mode 100644 new mode 100755 index e29d36cf2..cb918860c --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -539,7 +539,7 @@ template class JacobiSVD * according to the specified problem size. * \sa JacobiSVD() */ - explicit JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0) + JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0) { allocate(rows, cols, computationOptions); } -- cgit v1.2.3 From bcb4f126a735e68d7d2dcd08c853a89a31b3440e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 11:11:00 +0100 Subject: Fix compilation of PardisoSupport --- Eigen/PardisoSupport | 2 -- Eigen/src/PardisoSupport/PardisoSupport.h | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) mode change 100644 => 100755 Eigen/PardisoSupport diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport old mode 100644 new mode 100755 index 7dc9c7de0..340edf51f --- a/Eigen/PardisoSupport +++ b/Eigen/PardisoSupport @@ -14,8 +14,6 @@ #include -#include - /** \ingroup Support_modules * \defgroup PardisoSupport_Module PardisoSupport module * diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 9c18eb9b9..7c238ce3c 100755 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -117,7 +117,9 @@ class PardisoImpl : public SparseSolverBase typedef Matrix IntColVectorType; typedef Array ParameterType; enum { - ScalarIsComplex = NumTraits::IsComplex + ScalarIsComplex = NumTraits::IsComplex, + ColsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic }; PardisoImpl() -- cgit v1.2.3 From 7385e6e2ef944a4be9464760066ab072ed315e1c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 11:11:19 +0100 Subject: Remove useless explicit --- Eigen/src/SparseLU/SparseLU.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) mode change 100644 => 100755 Eigen/src/SparseLU/SparseLU.h diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h old mode 100644 new mode 100755 index acd3ad100..d33d27f46 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -101,7 +101,8 @@ class SparseLU : public SparseSolverBase >, { initperfvalues(); } - explicit SparseLU(const MatrixType& 
matrix):m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1) + explicit SparseLU(const MatrixType& matrix) + : m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1) { initperfvalues(); compute(matrix); @@ -719,7 +720,7 @@ template struct SparseLUMatrixUReturnType : internal::no_assignment_operator { typedef typename MatrixLType::Scalar Scalar; - explicit SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) + SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) : m_mapL(mapL),m_mapU(mapU) { } Index rows() { return m_mapL.rows(); } -- cgit v1.2.3 From 4519fd5d40031839ca8a9de4cc177bcbda95e360 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 11:11:38 +0100 Subject: Fix MKL compilation issue --- Eigen/src/QR/ColPivHouseholderQR_MKL.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h index fce4df08c..1203d0d36 100755 --- a/Eigen/src/QR/ColPivHouseholderQR_MKL.h +++ b/Eigen/src/QR/ColPivHouseholderQR_MKL.h @@ -52,9 +52,9 @@ ColPivHouseholderQR Date: Fri, 11 Dec 2015 11:43:49 +0100 Subject: bug #1132: add EIGEN_MAPBASE_PLUGIN --- Eigen/src/Core/MapBase.h | 4 ++++ doc/PreprocessorDirectives.dox | 1 + 2 files changed, 5 insertions(+) diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index ae28d4db6..75a80daaa 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -155,6 +155,10 @@ template class MapBase checkSanity(); } + #ifdef EIGEN_MAPBASE_PLUGIN + #include EIGEN_MAPBASE_PLUGIN + #endif + protected: EIGEN_DEVICE_FUNC diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index 76ce2eb99..7cde1a36f 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox @@ -106,6 +106,7 @@ following macros are supported; none of them are defined by default. - \b EIGEN_MATRIX_PLUGIN - filename of plugin for extending the Matrix class. - \b EIGEN_MATRIXBASE_PLUGIN - filename of plugin for extending the MatrixBase class. - \b EIGEN_PLAINOBJECTBASE_PLUGIN - filename of plugin for extending the PlainObjectBase class. + - \b EIGEN_MAPBASE_PLUGIN - filename of plugin for extending the MapBase class. - \b EIGEN_QUATERNION_PLUGIN - filename of plugin for extending the Quaternion class. - \b EIGEN_QUATERNIONBASE_PLUGIN - filename of plugin for extending the QuaternionBase class. - \b EIGEN_SPARSEMATRIX_PLUGIN - filename of plugin for extending the SparseMatrix class. 
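As an illustration of the new hook, a minimal plugin could look like this (the file name and the added member are hypothetical, not part of Eigen). The plugin file is textually included inside the body of class MapBase, so class-scope names such as Scalar and data() are available:
  // MyMapBasePlugin.h (hypothetical), enabled e.g. with
  //   -DEIGEN_MAPBASE_PLUGIN='"MyMapBasePlugin.h"'
  // Adds one extra member to MapBase:
  inline const Scalar* rawData() const { return this->data(); }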
-- cgit v1.2.3 From b60a8967f549250c2701112a029853e2d2d8ae64 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 11:59:11 +0100 Subject: bug #1134: fix JacobiSVD pre-allocation (grafted from f22036f5f8bbaa349e090327d246c817bac94ee4 ) --- Eigen/src/SVD/JacobiSVD.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index e29d36cf2..da1dbf5e7 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -666,7 +666,7 @@ void JacobiSVD::allocate(Index rows, Index cols, u if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); - if(m_cols!=m_cols) m_scaledMatrix.resize(rows,cols); + if(m_rows!=m_cols) m_scaledMatrix.resize(rows,cols); } template -- cgit v1.2.3 From 4d708457d07c0b9169d9e8336bb4d090791acd5d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 23:07:22 +0100 Subject: Increase axpy vector size --- bench/btl/generic_bench/bench_parameter.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index 0f62bd421..2b01149f9 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -29,7 +29,7 @@ // min vector size for axpy bench #define MIN_AXPY 5 // max vector size for axpy bench -#define MAX_AXPY 1000000 +#define MAX_AXPY 3000000 // min matrix size for matrix vector product bench #define MIN_MV 5 // max matrix size for matrix vector product bench -- cgit v1.2.3 From 4483c0fdf6bbab9133211b116a53271e5dcdeec5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 23:29:53 +0100 Subject: Fix unused variable warning. 
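The stubs below only trigger a static assertion and never read their argument, so the parameter name can simply be dropped; an unnamed parameter keeps the signature intact while silencing unused-parameter warnings. A simplified sketch of the idiom (not the Eigen code itself):
  template <typename Scalar>
  static Scalar run(const Scalar&)  // parameter intentionally unnamed
  {
    // Dependent-false assertion: fires only if the stub is instantiated.
    static_assert(sizeof(Scalar) == 0, "this type is not supported");
    return Scalar(0);
  }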
--- Eigen/src/Core/SpecialFunctions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 05973e372..d43cf23a1 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -21,7 +21,7 @@ template struct lgamma_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) + static EIGEN_STRONG_INLINE Scalar run(const Scalar&) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); @@ -59,7 +59,7 @@ template struct erf_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) + static EIGEN_STRONG_INLINE Scalar run(const Scalar&) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); @@ -97,7 +97,7 @@ template struct erfc_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar& x) + static EIGEN_STRONG_INLINE Scalar run(const Scalar&) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); -- cgit v1.2.3 From 140f3a02a825e9c3d72f1adf7ff3cc2e49dffea9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Dec 2015 23:31:21 +0100 Subject: Fix MKL wrapper for ComplexSchur --- Eigen/src/Eigenvalues/ComplexSchur_MKL.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h index 931573a4e..e20c3725b 100755 --- a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h +++ b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h @@ -53,7 +53,7 @@ ComplexSchur >::compute(const Eigen m_matUisUptodate = false; \ if(matrix.cols() == 1) \ { \ - m_matT = matrix.template cast(); \ + m_matT = matrix.derived().template cast(); \ if(computeU) m_matU = ComplexMatrixType::Identity(1,1); \ m_info = Success; \ m_isInitialized = true; \ -- cgit v1.2.3 From 75e19fc7cabfdb447dc740ee65399089f189e1fe Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 14 Dec 2015 15:12:55 -0800 Subject: Marked the tensor constructors as EIGEN_DEVICE_FUNC: This makes it possible to call them from a CUDA kernel. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 87ac8f5aa..ad525bac8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -340,34 +340,34 @@ class Tensor : public TensorBase - inline Tensor(Index firstDimension, IndexTypes... otherDimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) : m_storage(internal::array_prod(array{{firstDimension, otherDimensions...}}), array{{firstDimension, otherDimensions...}}) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #else - inline explicit Tensor(Index dim1) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1) : m_storage(dim1, array(dim1)) { EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - inline explicit Tensor(Index dim1, Index dim2) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2) : m_storage(dim1*dim2, array(dim1, dim2)) { EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - inline explicit Tensor(Index dim1, Index dim2, Index dim3) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3) : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) { EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) { EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) { EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -375,7 +375,7 @@ class Tensor : public TensorBase& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array& dimensions) : m_storage(internal::array_prod(dimensions), dimensions) { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED -- cgit v1.2.3 From 17352e27928ba74c2b4131f5905f9d90ace805b2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 14 Dec 2015 15:20:31 -0800 Subject: Made the entire TensorFixedSize api callable from a CUDA kernel. --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index bf930f6b8..a4d6ce6b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -68,7 +68,7 @@ class TensorFixedSize : public TensorBase - inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -100,7 +100,7 @@ class TensorFixedSize : public TensorBase - inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -132,7 +132,7 @@ class TensorFixedSize : public TensorBase - inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) const { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -171,7 +171,7 @@ class TensorFixedSize : public TensorBase - inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -221,7 +221,7 @@ class TensorFixedSize : public TensorBase Date: Tue, 15 Dec 2015 11:34:52 +0100 Subject: bug #1136: Protect isinf for Intel compilers. Also don't distinguish GCC from ICC and don't rely on EIGEN_NOT_A_MACRO, which might not be defined when including this. --- unsupported/test/mpreal/mpreal.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h index c4f6cf0cb..9b0cf7268 100644 --- a/unsupported/test/mpreal/mpreal.h +++ b/unsupported/test/mpreal/mpreal.h @@ -72,14 +72,12 @@ #define MPREAL_VERSION_STRING "3.6.2" // Detect compiler using signatures from http://predef.sourceforge.net/ -#if defined(__GNUC__) && defined(__INTEL_COMPILER) - #define IsInf(x) isinf(x) // Intel ICC compiler on Linux - +#if defined(__GNUC__) + #define IsInf(x) (isinf)(x) // GNU C++/Intel ICC compiler on Linux #elif defined(_MSC_VER) // Microsoft Visual C++ #define IsInf(x) (!_finite(x)) - #else - #define IsInf(x) std::isinf EIGEN_NOT_A_MACRO (x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance + #define IsInf(x) (std::isinf)(x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance #endif // A Clang feature extension to determine compiler features. 
@@ -3103,4 +3101,4 @@ namespace std } -#endif /* __MPREAL_H__ */ \ No newline at end of file +#endif /* __MPREAL_H__ */ -- cgit v1.2.3 From 35d8725c73cfcce45ebb774e25e51bd5ab5e61b7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Dec 2015 10:14:24 +0100 Subject: Disable AutoDiffScalar generic copy ctor for non compatible scalar types (fix ambiguous template instantiation) --- unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) mode change 100644 => 100755 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h old mode 100644 new mode 100755 index 8b58b512b..e30ad5b6d --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -99,7 +99,11 @@ class AutoDiffScalar {} template - AutoDiffScalar(const AutoDiffScalar& other) + AutoDiffScalar(const AutoDiffScalar& other +#ifndef EIGEN_PARSED_BY_DOXYGEN + , typename internal::enable_if::value,void*>::type = 0 +#endif + ) : m_value(other.value()), m_derivatives(other.derivatives()) {} @@ -127,6 +131,14 @@ class AutoDiffScalar return *this; } + inline AutoDiffScalar& operator=(const Scalar& other) + { + m_value = other; + if(m_derivatives.size()>0) + m_derivatives.setZero(); + return *this; + } + // inline operator const Scalar& () const { return m_value; } // inline operator Scalar& () { return m_value; } -- cgit v1.2.3 From ae8b217a01f07711aa2e57f6b3cf93da77d6d82a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Dec 2015 10:47:03 +0100 Subject: Update doc to make it clear that only SuperLU 4.x is supported --- Eigen/SuperLUSupport | 2 ++ Eigen/src/SuperLUSupport/SuperLUSupport.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Eigen/SuperLUSupport b/Eigen/SuperLUSupport index 0ae9f3fdf..113f58ee5 100644 --- a/Eigen/SuperLUSupport +++ b/Eigen/SuperLUSupport @@ -43,6 +43,8 @@ namespace Eigen { struct SluMatrix; } * - class SuperLU: a supernodal sequential LU factorization. * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods). * + * \warning This wrapper is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported. + * * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting. * * \code diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index b20da37f7..fd2b26581 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -452,6 +452,8 @@ class SuperLUBase : public SparseSolverBase * * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * + * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported. + * * \implsparsesolverconcept * * \sa \ref TutorialSparseDirectSolvers @@ -801,7 +803,7 @@ typename SuperLU::Scalar SuperLU::determinant() const * This class allows to solve for an approximate solution of A.X = B sparse linear problems via an incomplete LU factorization * using the SuperLU library. This class is aimed to be used as a preconditioner of the iterative linear solvers. * - * \warning This class requires SuperLU 4 or later. + * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported. 
* * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * -- cgit v1.2.3 
From 49d96aee6448d67edbb0382fefca746304c5baaa Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 16 Dec 2015 11:37:16 +0100 Subject: bug #1120: Make sure that SuperLU version is checked --- cmake/FindSuperLU.cmake | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cmake/FindSuperLU.cmake b/cmake/FindSuperLU.cmake index 259ed7320..e4142fe4d 100644 --- a/cmake/FindSuperLU.cmake +++ b/cmake/FindSuperLU.cmake @@ -60,11 +60,21 @@ endif() cmake_pop_check_state() +if(SuperLU_FIND_VERSION) + if(${SUPERLU_VERSION_VAR} VERSION_LESS ${SuperLU_FIND_VERSION}) + set(SUPERLU_VERSION_OK FALSE) + else() + set(SUPERLU_VERSION_OK TRUE) + endif() +else() + set(SUPERLU_VERSION_OK TRUE) +endif() + endif() include(FindPackageHandleStandardArgs) find_package_handle_standard_args(SUPERLU - REQUIRED_VARS SUPERLU_INCLUDES SUPERLU_LIBRARIES + REQUIRED_VARS SUPERLU_INCLUDES SUPERLU_LIBRARIES SUPERLU_VERSION_OK VERSION_VAR SUPERLU_VERSION_VAR) mark_as_advanced(SUPERLU_INCLUDES SUPERLU_LIBRARIES) -- cgit v1.2.3 
From 9f9de1aaa9a508fc6c94ddacd12b9107462f688f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Dec 2015 21:48:48 +0100 Subject: bump to 3.3-beta1 --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e0bc1689d..9b4f8faa7 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -13,7 +13,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 2 -#define EIGEN_MINOR_VERSION 91 +#define EIGEN_MINOR_VERSION 92 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ -- cgit v1.2.3 
From 2ca55a3ae45e1b5137c94267274465bf509f7c72 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 16 Dec 2015 20:45:58 -0800 Subject: Fixed some compilation errors triggered by the tensor code with MSVC 2008 --- unsupported/Eigen/CXX11/Tensor | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index c681d3c20..7481a9ddb 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -28,14 +28,22 @@ #include #include + +#ifdef _WIN32 +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else #include <stdint.h> +#endif #if __cplusplus > 199711 #include <random> #endif #ifdef _WIN32 -#include +#include #elif defined(__APPLE__) #include #else -- cgit v1.2.3 
From 40e6250fc3737ff76224b04c94c2de3ce0d51607 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Dec 2015 13:29:08 -0800 Subject: Made it possible to run tensor chipping operations on CUDA devices --- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c9fa39e51..abc3c92ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -50,7 +50,7 @@ struct nested, 1, typename eval struct DimensionId { - DimensionId(DenseIndex dim) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex
actualDim() const { @@ -60,7 +60,7 @@ struct DimensionId template <> struct DimensionId { - DimensionId(DenseIndex dim) : actual_dim(dim) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { eigen_assert(dim >= 0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { -- cgit v1.2.3 From 4aac55f684d9bd36b5f855fa5a8c2f17ca3094c9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Dec 2015 13:39:01 -0800 Subject: Silenced some compilation warnings triggered by nvcc --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 7 +++++-- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 ++++++++++++---- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 6 +++--- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index c76d1ee3f..4d7570077 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -238,11 +238,14 @@ struct GpuDevice { }; - +#ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); - +#else +#define LAUNCH_CUDA_KERNEL(...) \ + eigen_assert(false && "Cannot launch a kernel from another kernel"); +#endif // FIXME: Should be device and kernel specific. #ifdef __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d93e1de1b..c28078882 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template class TensorExecutor { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); }; template class TensorExecutor { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -213,8 +213,9 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { /*static*/ template -inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { +#ifndef __CUDA_ARCH__ TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -227,13 +228,17 @@ inline void TensorExecutor::run(const Expression& LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); +#else + eigen_assert(false && "Cannot launch a kernel from another kernel"); +#endif } /*static*/ template -inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { +#ifndef __CUDA_ARCH__ TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -246,6 +251,9 @@ inline void TensorExecutor::run(const Expression& e LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), 
num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); +#else + eigen_assert(false && "Cannot launch a kernel from another kernel"); +#endif } #endif // __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index bd15295b8..aaa877185 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -454,7 +454,7 @@ struct TensorEvaluator, Device> input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; } } - + int outputIndex = 0; int reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { @@ -473,13 +473,13 @@ struct TensorEvaluator, Device> m_preservedStrides[0] = internal::array_prod(input_dims); } } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } typedef typename internal::remove_const::type CoeffReturnType; typedef typename internal::remove_const::type PacketReturnType; - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. -- cgit v1.2.3 From 8dd17cbe80ef460e9fbd562d6de6ae19b264caea Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Dec 2015 14:00:33 -0800 Subject: Fixed a clang compilation warning triggered by the use of arrays of size 0. --- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 785321666..f28a9699d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -101,13 +101,18 @@ bool operator!=(const Tuple& x, const Tuple& y) { #ifdef EIGEN_HAS_SFINAE -namespace internal{ +namespace internal { template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx, numeric_list) { return { idx[Is]... }; } + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType&, numeric_list) { + return array(); + } /** Make an array (for index/dimensions) out of a custom index */ template -- cgit v1.2.3 From 3abd8470caf60473851f0c5b40ed8abff5c03931 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 Dec 2015 14:18:59 +0100 Subject: bug #1140: remove custom definition and use of _mm256_setr_m128 --- Eigen/src/Core/arch/AVX/MathFunctions.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index c4bd6bd53..9ced9b717 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,11 +10,6 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -// For some reason, this function didn't make it into the avxintirn.h -// used by the compiler, so we'll just wrap it. 
-#define _mm256_setr_m128(lo, hi) \ - _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) - /* The sin, cos, exp, and log functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -63,7 +58,7 @@ psin(const Packet8f& _x) { _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 0), 31); __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 1), 31); - Packet8i sign_flip_mask = _mm256_setr_m128(lo, hi); + Packet8i sign_flip_mask = _mm256_setr_m128(hi, lo); #endif // Create a mask for which interpolant to use, i.e. if z > 1, then the mask @@ -149,7 +144,7 @@ plog(const Packet8f& _x) { #else __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 0), 23); __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 1), 23); - Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_setr_m128(lo, hi)); + Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_set_m128(hi,lo)); #endif Packet8f e = _mm256_sub_ps(emm0, p8f_126f); @@ -264,7 +259,7 @@ pexp(const Packet8f& _x) { #else __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 0), 23); __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 1), 23); - emm0 = _mm256_setr_m128(lo, hi); + emm0 = _mm256_set_m128(hi,lo); #endif // Return 2^m * exp(r). -- cgit v1.2.3 From 75a7fa1919af749ba79a2b70c542320707837f61 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Dec 2015 14:07:31 -0800 Subject: Doubled the speed of full reductions on GPUs. --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 49102fca2..af1b9432c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -87,15 +87,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num } typename Self::CoeffReturnType accum = reducer.initialize(); - for (Index i = 0; i < NumPerThread; ++i) { - const Index index = first_index + i * BlockSize; - if (index >= num_coeffs) { - break; - } + Index max_iter = numext::mini(num_coeffs - first_index, NumPerThread*BlockSize); + for (Index i = 0; i < max_iter; i+=BlockSize) { + const Index index = first_index + i; + eigen_assert(index < num_coeffs); typename Self::CoeffReturnType val = input.m_impl.coeff(index); reducer.reduce(val, &accum); } +#pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { reducer.reduce(__shfl_down(accum, offset), &accum); } -- cgit v1.2.3 From 1b829695598a823fe3d9132d35ccdbb6e176c47e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Dec 2015 14:36:35 -0800 Subject: Add alignment requirement for local buffer used by the slicing op. 
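Rationale: without EIGEN_ALIGN_MAX the local scratch array only gets the natural alignment of its element type, so reading it back with an aligned packet load is undefined behaviour on some targets. A simplified illustration of the pattern, using plain AVX intrinsics instead of Eigen's packet layer (names are illustrative):
  #include <immintrin.h>

  // Gather strided floats into an aligned scratch buffer, then perform a
  // single aligned packet load; alignas(32) plays the role of EIGEN_ALIGN_MAX.
  static __m256 gather8(const float* src, int stride) {
    alignas(32) float values[8];
    for (int i = 0; i < 8; ++i) values[i] = src[i * stride];
    return _mm256_load_ps(values);  // safe: the buffer is 32-byte aligned
  }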
--- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index bdc86e0fa..d8c923d74 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -443,7 +443,7 @@ struct TensorEvaluator, Devi return rslt; } else { - typename internal::remove_const<Scalar>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<Scalar>::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); for (int i = 1; i < packetSize-1; ++i) { -- cgit v1.2.3 
From 6d777e1bc7d31023ad78c84777847896ab31927d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Dec 2015 19:25:50 -0800 Subject: Fixed a typo. --- Eigen/src/Core/arch/AVX/MathFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 9ced9b717..7baf57eca 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -58,7 +58,7 @@ psin(const Packet8f& _x) { _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 0), 31); __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 1), 31); - Packet8i sign_flip_mask = _mm256_setr_m128(hi, lo); + Packet8i sign_flip_mask = _mm256_set_m128(hi, lo); #endif -- cgit v1.2.3 
From 95dd423cca316e618e712038cbe4d5abf11a1da4 Mon Sep 17 00:00:00 2001 From: connor-k Date: Mon, 21 Dec 2015 01:12:26 +0000 Subject: [doc] Remove extra ';' in Tutorial_AdvancedInitialization_Join.cpp --- doc/snippets/Tutorial_AdvancedInitialization_Join.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp b/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp index 84e8715cb..55a21539d 100644 --- a/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp +++ b/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp @@ -3,7 +3,7 @@ vec1 << 1, 2, 3; std::cout << "vec1 = " << vec1 << std::endl; RowVectorXd vec2(4); -vec2 << 1, 4, 9, 16;; +vec2 << 1, 4, 9, 16; std::cout << "vec2 = " << vec2 << std::endl; RowVectorXd joined(7); -- cgit v1.2.3 
From 51be91f15e745fbcc7b1b3584b2d0b947a500272 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 21 Dec 2015 08:42:58 -0800 Subject: Added support for CUDA architectures that don't support compute capability 3.5 --- Eigen/src/Core/arch/CUDA/PacketMath.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index cb1b547e0..4495b3741 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -183,25 +183,39 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to to[1] = from.y; } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return __ldg((const float4*)from); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return __ldg((const double2*)from); +#else +
return make_float2(from[0], from[1]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return make_double2(__ldg(from+0), __ldg(from+1)); -} +#else + return make_float2(from[0], from[1]); #endif +} template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); -- cgit v1.2.3 From a6c243617bc5ac3e806e7fd756b0c18b2e33d3c1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 21 Dec 2015 09:05:45 -0800 Subject: Fixed a typo in previous change. --- Eigen/src/Core/arch/CUDA/PacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 4495b3741..9d5773106 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -196,7 +196,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return __ldg((const double2*)from); #else - return make_float2(from[0], from[1]); + return make_double2(from[0], from[1]); #endif } @@ -213,7 +213,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(cons #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return make_double2(__ldg(from+0), __ldg(from+1)); #else - return make_float2(from[0], from[1]); + return make_double2(from[0], from[1]); #endif } -- cgit v1.2.3 From 1c3e78319d9de0b1480a05f75a0d9dde339b57de Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 21 Dec 2015 15:05:01 -0800 Subject: Added missing const --- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 65fd25a2e..c16bf7e67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> } typedef TensorEvalToOp EvalTo; EvalTo evalToTmp(m_buffer, m_op); - const bool PacketAccess = internal::IsVectorizable::value; + const bool PacketAccess = internal::IsVectorizable::value; internal::TensorExecutor::run(evalToTmp, m_device); m_impl.cleanup(); return true; -- cgit v1.2.3 From 3504ae47ca7a2a712155c48e13bb4168cdebd5a6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 21 Dec 2015 15:20:06 -0800 Subject: Made it possible to run the lgamma, erf, and erfc functors on a CUDA gpu. 
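The change amounts to tagging the packetOp members with EIGEN_DEVICE_FUNC, which under nvcc expands to __host__ __device__ and thereby makes the same functor callable from kernel code. A reduced sketch of the mechanism (macro and struct names are illustrative, not Eigen's):
  #if defined(__CUDACC__)
  #define MY_DEVICE_FUNC __host__ __device__  // mirrors EIGEN_DEVICE_FUNC
  #else
  #define MY_DEVICE_FUNC
  #endif

  struct plus_one_op {
    // Callable from both host code and CUDA kernels.
    MY_DEVICE_FUNC float operator()(float a) const { return a + 1.0f; }
  };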
--- Eigen/src/Core/functors/UnaryFunctors.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 6891cfdda..01727f250 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -415,7 +415,7 @@ template struct scalar_lgamma_op { using numext::lgamma; return lgamma(a); } typedef typename packet_traits::type Packet; - inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } }; template struct functor_traits > @@ -438,7 +438,7 @@ template struct scalar_erf_op { using numext::erf; return erf(a); } typedef typename packet_traits::type Packet; - inline Packet packetOp(const Packet& a) const { return internal::perf(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); } }; template struct functor_traits > @@ -461,7 +461,7 @@ template struct scalar_erfc_op { using numext::erfc; return erfc(a); } typedef typename packet_traits::type Packet; - inline Packet packetOp(const Packet& a) const { return internal::perfc(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); } }; template struct functor_traits > @@ -732,10 +732,10 @@ struct functor_traits > { * \sa class CwiseUnaryOp, Cwise::sign() */ template::IsComplex!=0) > struct scalar_sign_op; -template +template struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar( (a>Scalar(0)) - (a { //template //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } }; -template +template struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::abs; typedef typename NumTraits::Real real_type; real_type aa = abs(a); if (aa==0) - return Scalar(0); - aa = 1./aa; + return Scalar(0); + aa = 1./aa; return Scalar(real(a)*aa, imag(a)*aa ); } //TODO -- cgit v1.2.3 From b5d2078c4a9cdb81416586cca5658e38b059148d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Dec 2015 15:06:17 -0800 Subject: Optimized outer reduction on GPUs. 
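In the Tensor module, an "outer" reduction collapses the outer dimension while preserving the inner ones, so each output coefficient accumulates one strided column of the input; that is the access pattern the new kernel targets. A host-side usage sketch (shapes are illustrative; exercising the CUDA path additionally requires evaluating through a GpuDevice):
  #include <unsupported/Eigen/CXX11/Tensor>

  int main() {
    Eigen::Tensor<float, 2> t(128, 1024);  // dim 0 inner, dim 1 outer (ColMajor)
    t.setRandom();
    Eigen::array<int, 1> dims;
    dims[0] = 1;                           // reduce along the outer dimension
    Eigen::Tensor<float, 1> s = t.sum(dims);  // 128 per-row sums
    return s.size() == 128 ? 0 : 1;
  }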
--- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 39 +++++++++++++++- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 54 ++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index aaa877185..c30980a49 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -337,9 +337,23 @@ struct FullReducer { #endif +// Default outer reducer +template +struct OuterReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + assert(false && "Not implemented"); + } +}; + + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + +template +__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif } // end namespace internal @@ -439,7 +453,7 @@ struct TensorEvaluator, Device> } } } - + // Precompute input strides. if (NumInputDims > 0) { array input_strides; @@ -498,6 +512,28 @@ struct TensorEvaluator, Device> internal::FullReducer::run(*this, reducer, m_device, data); return need_assign; } + + // Attempt to use an optimized reduction. +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + bool preserving_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + preserving_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + } else { + preserving_inner_dims &= m_reducedDims[i]; + } + } + if (internal::OuterReducer::HasOptimizedImplementation && + preserving_inner_dims) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + Op reducer(m_reducer); + internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return false; + } + } +#endif return true; } @@ -579,6 +615,7 @@ struct TensorEvaluator, Device> #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif // Returns the Index in the input tensor of the first value that needs to be diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index af1b9432c..f0e9d528e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -131,6 +131,60 @@ struct FullReducer { } }; + +template +__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + // Initialize the output values + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + + // Do the reduction. 
+ const Index max_iter = DIVUP(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; + for (Index i = thread_id; i < max_iter; i += num_threads) { + const Index input_col = i % num_preserved_coeffs; + const Index input_row = (i / num_preserved_coeffs) * NumPerThread; + typename Self::CoeffReturnType reduced_val = reducer.initialize(); + const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); + for (Index j = input_row; j < max_row; j++) { + typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col); + reducer.reduce(val, &reduced_val); + } + atomicReduce(&(output[input_col]), reduced_val, reducer); + } +} + + +template <typename Self, typename Op> +struct OuterReducer<Self, Op, GpuDevice> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same<typename Self::CoeffReturnType, float>::value; + + template <typename Device, typename OutputType> + static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce floats on a gpu device"); + } + + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 16; + const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); + + LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + } +}; + #endif -- cgit v1.2.3 
From e7e6d0181061c735fd4d69b1091f0b407e383aac Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Dec 2015 15:07:33 -0800 Subject: Made sure the optimized gpu reduction code is actually compiled. --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index f0e9d528e..5d9205b59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H namespace Eigen { namespace internal { @@ -191,4 +191,4 @@ struct OuterReducer { } // end namespace internal } // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H -- cgit v1.2.3 
From 9c7d96697b4e21960d679b7be8d5514a22fd80ab Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Dec 2015 16:11:07 -0800 Subject: Added missing define --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 5d9205b59..20dc72e85 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -131,6 +131,7 @@ struct FullReducer { } }; +#define DIVUP(x, y) (((x) + (y)-1) / (y)) template @@ -185,6 +186,8 @@ struct OuterReducer { } }; +#undef DIVUP + #endif -- cgit v1.2.3 
From a1e08fb2a55bf60c81de1687f825d0c3d4e62d22 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Dec 2015 16:30:10 -0800 Subject: Optimized the configuration of the outer reduction cuda kernel --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 20dc72e85..8e250867c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -179,7 +179,10 @@ struct OuterReducer { const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; - const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); + const int dyn_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); -- cgit v1.2.3 
From bdcbc66a5cca656e1cbdfa97668f3e400a7cb08d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Dec 2015 17:51:55 -0800 Subject: Don't attempt to vectorize mean reductions of integers since we can't use SSE or AVX instructions to divide 2 integers.
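Only the packet (vectorized) path is disabled here; integer means are still computed correctly through the scalar path. A minimal host-side example (illustrative):
  #include <unsupported/Eigen/CXX11/Tensor>

  int main() {
    Eigen::Tensor<int, 1> v(10);
    v.setConstant(3);
    Eigen::Tensor<int, 0> m = v.mean();  // evaluated without packet math
    return m() == 3 ? 0 : 1;
  }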
--- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 34ba4e392..f94ffa020 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -93,7 +93,7 @@ template struct SumReducer template struct MeanReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = !NumTraits::IsInteger; static const bool IsStateful = true; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -- cgit v1.2.3 From d2e288ae505172fea73e463a09fa9ebb78763a3e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Dec 2015 16:53:43 +0100 Subject: Workaround compilers that do not even define _mm256_set_m128. --- Eigen/src/Core/arch/AVX/MathFunctions.h | 53 ++++++++++++++++----------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 7baf57eca..b0e0222a4 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -18,6 +18,28 @@ namespace Eigen { namespace internal { +inline Packet8i pshiftleft(Packet8i v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(v, n); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +inline Packet8f pshiftright(Packet8f v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); + return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); +#endif +} + // Sine function // Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and // evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants @@ -49,17 +71,8 @@ psin(const Packet8f& _x) { // Make a mask for the entries that need flipping, i.e. wherever the shift // is odd. Packet8i shift_ints = _mm256_cvtps_epi32(shift); - Packet8i shift_isodd = - _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); -#ifdef EIGEN_VECTORIZE_AVX2 - Packet8i sign_flip_mask = _mm256_slli_epi32(shift_isodd, 31); -#else - __m128i lo = - _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 0), 31); - __m128i hi = - _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 1), 31); - Packet8i sign_flip_mask = _mm256_set_m128(hi, lo); -#endif + Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); + Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); // Create a mask for which interpolant to use, i.e. if z > 1, then the mask // is set to ones for that entry. @@ -137,15 +150,7 @@ plog(const Packet8f& _x) { // Truncate input values to the minimum positive normal. x = pmax(x, p8f_min_norm_pos); -// Extract the shifted exponents (No bitwise shifting in regular AVX, so -// convert to SSE and do it there). 
-#ifdef EIGEN_VECTORIZE_AVX2 - Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(x), 23)); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 0), 23); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 1), 23); - Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_set_m128(hi,lo)); -#endif + Packet8f emm0 = pshiftright(x,23); Packet8f e = _mm256_sub_ps(emm0, p8f_126f); // Set the exponents to -1, i.e. x are in the range [0.5,1). @@ -254,13 +259,7 @@ pexp(const Packet8f& _x) { // Build emm0 = 2^m. Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); -#ifdef EIGEN_VECTORIZE_AVX2 - emm0 = _mm256_slli_epi32(emm0, 23); -#else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 0), 23); - __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 1), 23); - emm0 = _mm256_set_m128(hi,lo); -#endif + emm0 = pshiftleft(emm0, 23); // Return 2^m * exp(r). return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); -- cgit v1.2.3 From f7362772e3236cdb8ae4d9be175f83a0b19902a0 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Thu, 24 Dec 2015 21:15:38 -0800 Subject: Add digamma for CPU + CUDA. Includes tests. --- Eigen/src/Core/GenericPacketMath.h | 5 + Eigen/src/Core/GlobalFunctions.h | 1 + Eigen/src/Core/SpecialFunctions.h | 426 ++++++++++++++++++++---- Eigen/src/Core/arch/CUDA/MathFunctions.h | 14 + Eigen/src/Core/arch/CUDA/PacketMath.h | 2 + Eigen/src/Core/functors/UnaryFunctors.h | 22 ++ Eigen/src/plugins/ArrayCwiseUnaryOps.h | 14 + test/array.cpp | 19 +- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 + 9 files changed, 447 insertions(+), 62 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8ad51bad5..4c7d1d848 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -75,6 +75,7 @@ struct default_packet_traits HasCosh = 0, HasTanh = 0, HasLGamma = 0, + HasDiGamma = 0, HasErf = 0, HasErfc = 0, @@ -439,6 +440,10 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); } +/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); } + /** \internal \returns the erf(\a a) (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perf(const Packet& a) { using numext::erf; return erf(a); } diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 62fec7008..396da8e71 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -50,6 +50,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index d43cf23a1..9fff3d74b 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -13,79 +13,390 @@ namespace Eigen { namespace internal { +namespace cephes { + +/* polevl (modified for Eigen) + * + * 
Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template +struct polevl { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar run(const Scalar x, const Scalar coef[]) { + EIGEN_STATIC_ASSERT(N > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + return polevl::run(x, coef) * x + coef[N]; + } +}; + +template +struct polevl { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar run(const Scalar, const Scalar coef[]) { + return coef[0]; + } +}; + +} // end namespace cephes + /**************************************************************************** * Implementation of lgamma * ****************************************************************************/ -template -struct lgamma_impl -{ +template +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template +struct lgamma_retval { + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template <> +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); } +}; + +template <> +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); } +}; +#endif + +/**************************************************************************** + * Implementation of digamma (psi) * + ****************************************************************************/ + +template +struct digamma_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar&) - { + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); } }; -template -struct lgamma_retval -{ +template +struct digamma_retval { typedef Scalar type; }; #ifdef EIGEN_HAS_C99_MATH -template<> -struct lgamma_impl -{ +template <> +struct digamma_impl { + /* + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * float x, y, psif(); + * + * y = psif( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. 
+ * + * ACCURACY: + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 INFINITY + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1984, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE double run(const float& x) { return ::lgammaf(x); } + static float run(float xx) { + float p, q, nz, x, s, w, y, z; + bool negative; + + // Some necessary constants + const float PIF = 3.141592653589793238; + const float MAXNUMF = std::numeric_limits::infinity(); + + const float A[] = { + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + x = xx; + nz = 0.0f; + negative = 0; + if (x <= 0.0f) { + negative = 1; + q = x; + p = ::floor(q); + if (p == q) { + return (MAXNUMF); + } + nz = q - p; + if (nz != 0.5f) { + if (nz > 0.5f) { + p += 1.0f; + nz = q - p; + } + nz = PIF / ::tan(PIF * nz); + } else { + nz = 0.0f; + } + x = 1.0f - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = 0.0f; + while (s < 10.0f) { + w += 1.0f / s; + s += 1.0f; + } + + if (s < 1.0e8f) { + z = 1.0f / (s * s); + y = z * cephes::polevl::run(z, A); + } else + y = 0.0f; + + y = ::log(s) - (0.5f / s) - y - w; + + return (negative) ? y - nz : y; + } }; -template<> -struct lgamma_impl -{ +template <> +struct digamma_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE double run(const double& x) { return ::lgamma(x); } + static double run(double x) { + /* + * + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * double x, y, psi(); + * + * y = psi( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. + * + * ACCURACY: + * Relative error (except absolute when |psi| < 1): + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 1.3e-15 1.4e-16 + * IEEE -30,0 40000 1.5e-15 2.2e-16 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 INFINITY + */ + + /* + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. 
Moshier + */ + double p, q, nz, s, w, y, z; + bool negative; + + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + const double MAXNUM = std::numeric_limits::infinity(); + const double PI = 3.14159265358979323846; + + negative = 0; + nz = 0.0; + + if (x <= 0.0) { + negative = 1; + q = x; + p = ::floor(q); + if (p == q) { + return MAXNUM; + } + /* Remove the zeros of tan(PI x) + * by subtracting the nearest integer from x + */ + nz = q - p; + if (nz != 0.5) { + if (nz > 0.5) { + p += 1.0; + nz = q - p; + } + nz = PI / ::tan(PI * nz); + } + else { + nz = 0.0; + } + x = 1.0 - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = 0.0; + while (s < 10.0) { + w += 1.0 / s; + s += 1.0; + } + + if (s < 1.0e17) { + z = 1.0 / (s * s); + y = z * cephes::polevl::run(z, A); + } + else + y = 0.0; + + y = ::log(s) - (0.5 / s) - y - w; + + return (negative) ? y - nz : y; + } }; + #endif /**************************************************************************** * Implementation of erf * ****************************************************************************/ -template -struct erf_impl -{ +template +struct erf_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar&) - { + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); } }; -template -struct erf_retval -{ +template +struct erf_retval { typedef Scalar type; }; #ifdef EIGEN_HAS_C99_MATH -template<> -struct erf_impl -{ +template <> +struct erf_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE float run(const float& x) { return ::erff(x); } + static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); } }; -template<> -struct erf_impl -{ +template <> +struct erf_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE double run(const double& x) { return ::erf(x); } + static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); } }; #endif // EIGEN_HAS_C99_MATH @@ -93,35 +404,30 @@ struct erf_impl * Implementation of erfc * ****************************************************************************/ -template -struct erfc_impl -{ +template +struct erfc_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar&) - { + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); } }; -template -struct erfc_retval -{ +template +struct erfc_retval { typedef Scalar type; }; #ifdef EIGEN_HAS_C99_MATH -template<> -struct erfc_impl -{ +template <> +struct erfc_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); } }; -template<> -struct erfc_impl -{ +template <> +struct erfc_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } }; @@ -129,27 +435,29 @@ struct erfc_impl } // end namespace internal - namespace numext { -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) lgamma(const Scalar& x) -{ +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) + lgamma(const Scalar& x) { return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); } -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x) -{ +template +EIGEN_DEVICE_FUNC inline 
EIGEN_MATHFUNC_RETVAL(digamma, Scalar) + digamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) + erf(const Scalar& x) { return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); } -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) erfc(const Scalar& x) -{ +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) + erfc(const Scalar& x) { return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); } diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index ecd5c444e..a2c06a817 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -78,6 +78,20 @@ double2 plgamma(const double2& a) return make_double2(lgamma(a.x), lgamma(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pdigamma(const float4& a) +{ + using numext::digamma; + return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pdigamma(const double2& a) +{ + using numext::digamma; + return make_double2(digamma(a.x), digamma(a.y)); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 perf(const float4& a) { diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 9d5773106..d3d9f910e 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -40,6 +40,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1, HasRsqrt = 1, HasLGamma = 1, + HasDiGamma = 1, HasErf = 1, HasErfc = 1, @@ -63,6 +64,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1, HasRsqrt = 1, HasLGamma = 1, + HasDiGamma = 1, HasErf = 1, HasErfc = 1, diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 01727f250..897ab04ba 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -427,6 +427,28 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute psi, the derivative of lgamma of a scalar. 
+ * \sa class CwiseUnaryOp, Cwise::digamma() + */ +template struct scalar_digamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::digamma; return digamma(a); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasDiGamma + }; +}; + /** \internal * \brief Template functor to compute the Gauss error function of a * scalar diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 01432e2f3..e818ac588 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -22,6 +22,7 @@ typedef CwiseUnaryOp, const Derived> TanhReturn typedef CwiseUnaryOp, const Derived> SinhReturnType; typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> LgammaReturnType; +typedef CwiseUnaryOp, const Derived> DigammaReturnType; typedef CwiseUnaryOp, const Derived> ErfReturnType; typedef CwiseUnaryOp, const Derived> ErfcReturnType; typedef CwiseUnaryOp, const Derived> PowReturnType; @@ -318,6 +319,19 @@ lgamma() const return LgammaReturnType(derived()); } +/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma). + * + * Example: \include Cwise_digamma.cpp + * Output: \verbinclude Cwise_digamma.out + * + * \sa cos(), sin(), tan() + */ +inline const DigammaReturnType +digamma() const +{ + return DigammaReturnType(derived()); +} + /** \returns an expression of the coefficient-wise Gauss error * function of *this. 
* diff --git a/test/array.cpp b/test/array.cpp index 6adedfb06..9366b73fd 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -219,6 +219,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.tanh(), tanh(m1)); #ifdef EIGEN_HAS_C99_MATH VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1)); + VERIFY_IS_APPROX(m1.digamma(), digamma(m1)); VERIFY_IS_APPROX(m1.erf(), erf(m1)); VERIFY_IS_APPROX(m1.erfc(), erfc(m1)); #endif // EIGEN_HAS_C99_MATH @@ -309,7 +310,20 @@ template void array_real(const ArrayType& m) s1 += Scalar(tiny); m1 += ArrayType::Constant(rows,cols,Scalar(tiny)); VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse()); - + + // check special functions (comparing against numpy implementation) + if (!NumTraits::IsComplex) { + VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329)); + VERIFY_IS_APPROX(numext::digamma(Scalar(1.5)), RealScalar(0.03648997397857645)); + VERIFY_IS_APPROX(numext::digamma(Scalar(4)), RealScalar(1.2561176684318)); + VERIFY_IS_APPROX(numext::digamma(Scalar(-10.5)), RealScalar(2.398239129535781)); + VERIFY_IS_APPROX(numext::digamma(Scalar(10000.5)), RealScalar(9.210340372392849)); + VERIFY_IS_EQUAL(numext::digamma(Scalar(0)), + std::numeric_limits::infinity()); + VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), + std::numeric_limits::infinity()); + } + // check inplace transpose m3 = m1; m3.transposeInPlace(); @@ -336,8 +350,6 @@ template void array_complex(const ArrayType& m) Array m3(rows, cols); - Scalar s1 = internal::random(); - for (Index i = 0; i < m.rows(); ++i) for (Index j = 0; j < m.cols(); ++j) m2(i,j) = sqrt(m1(i,j)); @@ -410,6 +422,7 @@ template void array_complex(const ArrayType& m) VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1); // scalar by array division + Scalar s1 = internal::random(); const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); m1 += ArrayType::Constant(rows,cols,Scalar(tiny)); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 392acf302..cca716d6f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -128,6 +128,12 @@ class TensorBase return unaryExpr(internal::scalar_lgamma_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> erf() const { -- cgit v1.2.3 From 14897600b77ce8400780f0f34a7bb3661ce5db62 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Thu, 24 Dec 2015 21:28:18 -0800 Subject: Protect digamma tests behind a EIGEN_HAS_C99_MATH check. 
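For context, the feature being guarded is the user-facing digamma API added in the previous commit. A minimal usage sketch, assuming a toolchain where EIGEN_HAS_C99_MATH is in effect (the expected values follow the numpy-checked constants in the test above; digamma(1) is minus the Euler-Mascheroni constant):

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::ArrayXd v(3);
      v << 1.0, 1.5, 4.0;
      std::cout << v.digamma() << "\n";                 // coefficient-wise psi: approx -0.577216, 0.036490, 1.256118
      std::cout << Eigen::numext::digamma(1.0) << "\n"; // scalar form, approx -0.577216
      return 0;
    }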
--- test/array.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/array.cpp b/test/array.cpp index 9366b73fd..96aef31c7 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -312,6 +312,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse()); // check special functions (comparing against numpy implementation) +#ifdef EIGEN_HAS_C99_MATH if (!NumTraits::IsComplex) { VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329)); VERIFY_IS_APPROX(numext::digamma(Scalar(1.5)), RealScalar(0.03648997397857645)); @@ -323,6 +324,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); } +#endif // EIGEN_HAS_C99_MATH // check inplace transpose m3 = m1; -- cgit v1.2.3 From afb35385bf565d3ddaee50b1da5b664422818934 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 28 Dec 2015 17:34:06 -0800 Subject: Change PI* to M_PI* in SpecialFunctions to avoid possible breakage with external DEFINEs. --- Eigen/src/Core/SpecialFunctions.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 9fff3d74b..a5f9cb62a 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -189,7 +189,7 @@ struct digamma_impl { bool negative; // Some necessary constants - const float PIF = 3.141592653589793238; + const float M_PIF = 3.141592653589793238; const float MAXNUMF = std::numeric_limits::infinity(); const float A[] = { @@ -215,7 +215,7 @@ struct digamma_impl { p += 1.0f; nz = q - p; } - nz = PIF / ::tan(PIF * nz); + nz = M_PIF / ::tan(M_PIF * nz); } else { nz = 0.0f; } @@ -315,7 +315,7 @@ struct digamma_impl { }; const double MAXNUM = std::numeric_limits::infinity(); - const double PI = 3.14159265358979323846; + const double M_PI = 3.14159265358979323846; negative = 0; nz = 0.0; @@ -327,7 +327,7 @@ struct digamma_impl { if (p == q) { return MAXNUM; } - /* Remove the zeros of tan(PI x) + /* Remove the zeros of tan(M_PI x) * by subtracting the nearest integer from x */ nz = q - p; @@ -336,7 +336,7 @@ struct digamma_impl { p += 1.0; nz = q - p; } - nz = PI / ::tan(PI * nz); + nz = M_PI / ::tan(M_PI * nz); } else { nz = 0.0; -- cgit v1.2.3 From f2471f31e0d65203c9e098727facdfda2ff7a076 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 28 Dec 2015 17:48:38 -0800 Subject: Modify constants in SpecialFunctions to lowercase (avoid name conflicts). 
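The conflict being avoided: on many platforms <cmath> defines M_PI as a preprocessor macro, so the constants introduced two commits ago could be macro-expanded into ill-formed declarations (a local "const double M_PI = ..." turns into "const double 3.14159... = ..."). Lower-case names sit outside the macro namespace. A compilable sketch of the safe pattern, applied to the tan(pi*x) reflection term digamma uses for negative arguments (the function name is illustrative only, not Eigen code):

    #include <cmath>

    // Reflection term psi(x) needs for negative arguments: pi / tan(pi * x).
    // A lower-case local constant cannot collide with the M_PI macro.
    double reflection_term(double x) {
      const double m_pi = 3.14159265358979323846;
      return m_pi / std::tan(m_pi * x);
    }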
--- Eigen/src/Core/SpecialFunctions.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index a5f9cb62a..8cf26f4d1 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -189,8 +189,8 @@ struct digamma_impl { bool negative; // Some necessary constants - const float M_PIF = 3.141592653589793238; - const float MAXNUMF = std::numeric_limits::infinity(); + const float m_pif = 3.141592653589793238; + const float maxnumf = std::numeric_limits::infinity(); const float A[] = { -4.16666666666666666667E-3, @@ -207,7 +207,7 @@ struct digamma_impl { q = x; p = ::floor(q); if (p == q) { - return (MAXNUMF); + return (maxnumf); } nz = q - p; if (nz != 0.5f) { @@ -215,7 +215,7 @@ struct digamma_impl { p += 1.0f; nz = q - p; } - nz = M_PIF / ::tan(M_PIF * nz); + nz = m_pif / ::tan(m_pif * nz); } else { nz = 0.0f; } @@ -314,8 +314,8 @@ struct digamma_impl { 8.33333333333333333333E-2 }; - const double MAXNUM = std::numeric_limits::infinity(); - const double M_PI = 3.14159265358979323846; + const double maxnum = std::numeric_limits::infinity(); + const double m_pi = 3.14159265358979323846; negative = 0; nz = 0.0; @@ -325,9 +325,9 @@ struct digamma_impl { q = x; p = ::floor(q); if (p == q) { - return MAXNUM; + return maxnum; } - /* Remove the zeros of tan(M_PI x) + /* Remove the zeros of tan(m_pi x) * by subtracting the nearest integer from x */ nz = q - p; @@ -336,7 +336,7 @@ struct digamma_impl { p += 1.0; nz = q - p; } - nz = M_PI / ::tan(M_PI * nz); + nz = m_pi / ::tan(m_pi * nz); } else { nz = 0.0; -- cgit v1.2.3 From 25f2b8d82423137efc0e446425016a375e87d5fa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 Dec 2015 15:50:11 +0100 Subject: bug #1141: add missing initialization of CholmodBase::m_*IsOk --- Eigen/src/CholmodSupport/CholmodSupport.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index 06421d5ed..5be5a84cb 100644 --- a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h @@ -178,14 +178,14 @@ class CholmodBase : public SparseSolverBase public: CholmodBase() - : m_cholmodFactor(0), m_info(Success) + : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false) { m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0); cholmod_start(&m_cholmod); } explicit CholmodBase(const MatrixType& matrix) - : m_cholmodFactor(0), m_info(Success) + : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false) { m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0); cholmod_start(&m_cholmod); -- cgit v1.2.3 From 978c379ed7b42ac83c7e3fc84abbd88b9ec4e38b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 12:52:38 +0100 Subject: Add missing ctor from uint --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index f5cca0ad7..4f2adb671 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -38,6 +38,8 @@ struct TensorUInt128 eigen_assert(x >= 0); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(unsigned int x) : high(0), low(x) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(int64_t x) : high(0), low(x) { 
eigen_assert(x >= 0); } -- cgit v1.2.3 From 16dd82ed511ccaa1c783e71d46c1d2bcd9e19907 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 15:11:42 +0100 Subject: Add missing snippet for sign/cwiseSign functions. --- doc/snippets/Cwise_sign.cpp | 2 ++ doc/snippets/MatrixBase_cwiseSign.cpp | 4 ++++ 2 files changed, 6 insertions(+) create mode 100644 doc/snippets/Cwise_sign.cpp create mode 100644 doc/snippets/MatrixBase_cwiseSign.cpp diff --git a/doc/snippets/Cwise_sign.cpp b/doc/snippets/Cwise_sign.cpp new file mode 100644 index 000000000..49920e4f1 --- /dev/null +++ b/doc/snippets/Cwise_sign.cpp @@ -0,0 +1,2 @@ +Array3d v(-3,5,0); +cout << v.sign() << endl; diff --git a/doc/snippets/MatrixBase_cwiseSign.cpp b/doc/snippets/MatrixBase_cwiseSign.cpp new file mode 100644 index 000000000..efd717955 --- /dev/null +++ b/doc/snippets/MatrixBase_cwiseSign.cpp @@ -0,0 +1,4 @@ +MatrixXd m(2,3); +m << 2, -4, 6, + -5, 1, 0; +cout << m.cwiseSign() << endl; -- cgit v1.2.3 From b84cefe61dcc42ca83c37bbadf0ca0463f711758 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 15:12:15 +0100 Subject: Add missing snippets for erf/erfc/lgamma functions. --- doc/snippets/Cwise_erf.cpp | 2 ++ doc/snippets/Cwise_erfc.cpp | 2 ++ doc/snippets/Cwise_lgamma.cpp | 2 ++ 3 files changed, 6 insertions(+) create mode 100644 doc/snippets/Cwise_erf.cpp create mode 100644 doc/snippets/Cwise_erfc.cpp create mode 100644 doc/snippets/Cwise_lgamma.cpp diff --git a/doc/snippets/Cwise_erf.cpp b/doc/snippets/Cwise_erf.cpp new file mode 100644 index 000000000..7f51c1b6a --- /dev/null +++ b/doc/snippets/Cwise_erf.cpp @@ -0,0 +1,2 @@ +Array4d v(-0.5,2,0,-7); +cout << v.erf() << endl; diff --git a/doc/snippets/Cwise_erfc.cpp b/doc/snippets/Cwise_erfc.cpp new file mode 100644 index 000000000..f0453d4b1 --- /dev/null +++ b/doc/snippets/Cwise_erfc.cpp @@ -0,0 +1,2 @@ +Array4d v(-0.5,2,0,-7); +cout << v.erfc() << endl; diff --git a/doc/snippets/Cwise_lgamma.cpp b/doc/snippets/Cwise_lgamma.cpp new file mode 100644 index 000000000..cbc69b989 --- /dev/null +++ b/doc/snippets/Cwise_lgamma.cpp @@ -0,0 +1,2 @@ +Array4d v(0.5,10,0,-1); +cout << v.lgamma() << endl; \ No newline at end of file -- cgit v1.2.3 From 5fae3750b5a110c495547429e46016d29f6f0c9b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 16:02:05 +0100 Subject: Recent versions of doxygen miss-parsed Eigen/* headers --- doc/Doxyfile.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index e0c6a7e34..f9d5af812 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -273,7 +273,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions # you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = .h=C++ no_extension=C++ # If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all # comments according to the Markdown format, which allows for more readable @@ -803,7 +803,7 @@ EXAMPLE_RECURSIVE = NO # directories that contain image that are included in the documentation (see # the \image command). -IMAGE_PATH = +IMAGE_PATH = ${Eigen_BINARY_DIR}/doc/html # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. 
Doxygen will invoke the filter program -- cgit v1.2.3 From 162ccb2938529e897a0e65821a5ce18655be23ce Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 16:03:14 +0100 Subject: Fix links to Eigen2-to-Eigen3 porting helpers --- doc/A05_PortingFrom2To3.dox | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/A05_PortingFrom2To3.dox b/doc/A05_PortingFrom2To3.dox index 2d9182bbb..0dbddb976 100644 --- a/doc/A05_PortingFrom2To3.dox +++ b/doc/A05_PortingFrom2To3.dox @@ -9,8 +9,8 @@ and gives tips to help porting your application from Eigen2 to Eigen3. \section CompatibilitySupport Eigen2 compatibility support -Up to version 3.2 %Eigen provides Eigen2 support modes. These are removed now, because they were barely used anymore and became hard to maintain after internal re-designs. -You can still use them by first porting your code to Eigen 3.2. +Up to version 3.2 %Eigen provides Eigen2 support modes. These are removed now, because they were barely used anymore and became hard to maintain after internal re-designs. +You can still use them by first porting your code to Eigen 3.2. \section Using The USING_PART_OF_NAMESPACE_EIGEN macro @@ -223,7 +223,7 @@ triangular part to work on \section GeometryModule Changes in the Geometry module -The Geometry module is the one that changed the most. If you rely heavily on it, it's probably a good idea to use the \ref Eigen2SupportModes "Eigen 2 support modes" to perform your migration. +The Geometry module is the one that changed the most. If you rely heavily on it, it's probably a good idea to use the "Eigen 2 support modes" to perform your migration. \section Transform The Transform class -- cgit v1.2.3 From 29bb599e0392833667f9dc9df4b2d34145db60da Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 16:04:24 +0100 Subject: Fix numerous doxygen issues in auto-link generation --- Eigen/Geometry | 12 ++++++------ Eigen/src/CholmodSupport/CholmodSupport.h | 8 ++++---- Eigen/src/Core/CwiseNullaryOp.h | 6 +++--- Eigen/src/Core/Dot.h | 8 ++++++-- Eigen/src/Core/MatrixBase.h | 4 ++++ Eigen/src/Core/PlainObjectBase.h | 6 +++--- Eigen/src/Core/util/Constants.h | 2 +- Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 7 +++---- Eigen/src/Geometry/Homogeneous.h | 8 ++++---- Eigen/src/Geometry/OrthoMethods.h | 18 ++++++++++++------ Eigen/src/Geometry/Scaling.h | 5 +++-- Eigen/src/PaStiXSupport/PaStiXSupport.h | 6 +++--- Eigen/src/PardisoSupport/PardisoSupport.h | 6 +++--- Eigen/src/SVD/SVDBase.h | 6 +++--- Eigen/src/SparseCore/SparseRef.h | 2 +- Eigen/src/SparseLU/SparseLU.h | 2 +- Eigen/src/SuperLUSupport/SuperLUSupport.h | 4 ++-- Eigen/src/UmfPackSupport/UmfPackSupport.h | 4 +++- doc/QuickReference.dox | 2 +- doc/SparseLinearSystems.dox | 14 +++++++------- doc/StructHavingEigenMembers.dox | 6 +++--- doc/TopicLazyEvaluation.dox | 4 ++-- doc/TutorialArrayClass.dox | 2 +- doc/TutorialReductionsVisitorsBroadcasting.dox | 4 ++-- 24 files changed, 81 insertions(+), 65 deletions(-) diff --git a/Eigen/Geometry b/Eigen/Geometry index 06b736e3f..716d52952 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -17,16 +17,16 @@ #include /** \defgroup Geometry_Module Geometry module - * - * * * This module provides support for: * - fixed-size homogeneous transformations * - translation, scaling, 2D and 3D rotations - * - quaternions - * - \ref MatrixBase::cross() "cross product" - * - \ref MatrixBase::unitOrthogonal() "orthognal vector generation" - * - some linear components: parametrized-lines and hyperplanes + * - \link 
Quaternion quaternions \endlink + * - cross products (\ref MatrixBase::cross, \ref MatrixBase::cross3) + * - orthognal vector generation (\ref MatrixBase::unitOrthogonal) + * - some linear components: \link ParametrizedLine parametrized-lines \endlink and \link Hyperplane hyperplanes \endlink + * - \link AlignedBox axis aligned bounding boxes \endlink + * - \link umeyama least-square transformation fitting \endlink * * \code * #include diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index 5be5a84cb..2da962471 100644 --- a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h @@ -358,7 +358,7 @@ class CholmodBase : public SparseSolverBase * * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed. * - * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLLT + * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLLT */ template class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT<_MatrixType, _UpLo> > @@ -407,7 +407,7 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl * * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed. * - * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLDLT + * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLDLT */ template class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT<_MatrixType, _UpLo> > @@ -454,7 +454,7 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp * * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed. * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept */ template class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT<_MatrixType, _UpLo> > @@ -503,7 +503,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper * * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed. * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept */ template class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecomposition<_MatrixType, _UpLo> > diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 2bc6933d9..1dd109d3d 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -328,7 +328,7 @@ EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) setConstant(val); } -/** Sets all coefficients in this expression to \a value. +/** Sets all coefficients in this expression to value \a val. * * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() */ @@ -338,7 +338,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) return derived() = Constant(rows(), cols(), val); } -/** Resizes to the given \a size, and sets all coefficients in this expression to the given \a value. +/** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val. 
* * \only_for_vectors * @@ -355,7 +355,7 @@ PlainObjectBase::setConstant(Index size, const Scalar& val) return setConstant(val); } -/** Resizes to the given size, and sets all coefficients in this expression to the given \a value. +/** Resizes to the given size, and sets all coefficients in this expression to the given value \a val. * * \param rows the new number of rows * \param cols the new number of columns diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 003450f1a..c5040c67b 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -82,7 +82,7 @@ MatrixBase::dot(const MatrixBase& other) const * In both cases, it consists in the sum of the square of all the matrix entries. * For vectors, this is also equals to the dot product of \c *this with itself. * - * \sa dot(), norm() + * \sa dot(), norm(), lpNorm() */ template EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const @@ -94,7 +94,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * In both cases, it consists in the square root of the sum of the square of all the matrix entries. * For vectors, this is also equals to the square root of the dot product of \c *this with itself. * - * \sa dot(), squaredNorm() + * \sa lpNorm(), dot(), squaredNorm() */ template inline typename NumTraits::Scalar>::Real MatrixBase::norm() const @@ -188,7 +188,11 @@ struct lpNorm_selector */ template template +#ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NumTraits::Scalar>::Real +#else +MatrixBase::RealScalar +#endif MatrixBase::lpNorm() const { return internal::lpNorm_selector::run(*this); diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 9d612c852..8d400d277 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -386,7 +386,11 @@ template class MatrixBase #endif // EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC +#ifndef EIGEN_PARSED_BY_DOXYGEN inline typename cross_product_return_type::type +#else + inline PlainObject +#endif cross(const MatrixBase& other) const; template diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 1225e85b4..b7a4fcea8 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -533,7 +533,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type public: - /** \copydoc MatrixBase::operator=(const EigenBase&) + /** \copydoc DenseBase::operator=(const EigenBase&) */ template EIGEN_DEVICE_FUNC @@ -618,8 +618,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type //@} using Base::setConstant; - EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& value); - EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& value); + EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val); using Base::setZero; EIGEN_DEVICE_FUNC Derived& setZero(Index size); diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index a364f48d1..ec4a79e99 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -224,7 +224,7 @@ enum { /** \ingroup enums * Enum for indicating whether a buffer is aligned or not. */ -enum { +enum Foo { Unaligned=0, /**< Data pointer has no specific alignment. */ Aligned8=8, /**< Data pointer is aligned on a 8 bytes boundary. */ Aligned16=16, /**< Data pointer is aligned on a 16 bytes boundary. 
*/ diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index c64555096..dc9cd0a1a 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -228,6 +228,7 @@ template class SelfAdjointEigenSolver * * \param[in] diag The vector containing the diagonal of the matrix. * \param[in] subdiag The subdiagonal of the matrix. + * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly. * \returns Reference to \c *this * * This function assumes that the matrix has been reduced to tridiagonal form. @@ -299,8 +300,7 @@ template class SelfAdjointEigenSolver * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out * - * \sa operatorInverseSqrt(), - * \ref MatrixFunctions_Module "MatrixFunctions Module" + * \sa operatorInverseSqrt(), MatrixFunctions Module */ EIGEN_DEVICE_FUNC MatrixType operatorSqrt() const @@ -325,8 +325,7 @@ template class SelfAdjointEigenSolver * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out * - * \sa operatorSqrt(), MatrixBase::inverse(), - * \ref MatrixFunctions_Module "MatrixFunctions Module" + * \sa operatorSqrt(), MatrixBase::inverse(), MatrixFunctions Module */ EIGEN_DEVICE_FUNC MatrixType operatorInverseSqrt() const diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h index 4107fba4d..367fd3930 100644 --- a/Eigen/src/Geometry/Homogeneous.h +++ b/Eigen/src/Geometry/Homogeneous.h @@ -112,7 +112,7 @@ template class Homogeneous typename MatrixType::Nested m_matrix; }; -/** \geometry_module +/** \geometry_module \ingroup Geometry_Module * * \return an expression of the equivalent homogeneous vector * @@ -131,7 +131,7 @@ MatrixBase::homogeneous() const return HomogeneousReturnType(derived()); } -/** \geometry_module +/** \geometry_module \ingroup Geometry_Module * * \returns a matrix expression of homogeneous column (or row) vectors * @@ -146,7 +146,7 @@ VectorwiseOp::homogeneous() const return HomogeneousReturnType(_expression()); } -/** \geometry_module +/** \geometry_module \ingroup Geometry_Module * * \returns an expression of the homogeneous normalized vector of \c *this * @@ -164,7 +164,7 @@ MatrixBase::hnormalized() const ColsAtCompileTime==1?1:size()-1) / coeff(size()-1); } -/** \geometry_module +/** \geometry_module \ingroup Geometry_Module * * \returns an expression of the homogeneous normalized vector of \c *this * diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h index 39b64b869..c3648f51f 100644 --- a/Eigen/src/Geometry/OrthoMethods.h +++ b/Eigen/src/Geometry/OrthoMethods.h @@ -13,7 +13,7 @@ namespace Eigen { -/** \geometry_module +/** \geometry_module \ingroup Geometry_Module * * \returns the cross product of \c *this and \a other * @@ -26,7 +26,11 @@ namespace Eigen { */ template template +#ifndef EIGEN_PARSED_BY_DOXYGEN inline typename MatrixBase::template cross_product_return_type::type +#else +inline typename MatrixBase::PlainObject +#endif MatrixBase::cross(const MatrixBase& other) const { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3) @@ -63,7 +67,7 @@ struct cross3_impl { } -/** \geometry_module +/** \geometry_module \ingroup Geometry_Module * * \returns the cross product of \c *this and \a other using only the x, y, and z coefficients * @@ -90,14 +94,14 @@ MatrixBase::cross3(const MatrixBase& other) const 
typename internal::remove_all::type>::run(lhs,rhs); } -/** \returns a matrix expression of the cross product of each column or row +/** \geometry_module \ingroup Geometry_Module + * + * \returns a matrix expression of the cross product of each column or row * of the referenced expression with the \a other vector. * * The referenced matrix must have one dimension equal to 3. * The result matrix has the same dimensions than the referenced one. * - * \geometry_module - * * \sa MatrixBase::cross() */ template template @@ -207,7 +211,9 @@ struct unitOrthogonal_selector } // end namespace internal -/** \returns a unit vector which is orthogonal to \c *this +/** \geometry_module \ingroup Geometry_Module + * + * \returns a unit vector which is orthogonal to \c *this * * The size of \c *this must be at least 2. If the size is exactly 2, * then the returned vector is a counter clock wise rotation of \c *this, i.e., (-y,x).normalized(). diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index 023fba2ee..e94aa189f 100644 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -104,6 +104,9 @@ public: }; +/** \addtogroup Geometry_Module */ +//@{ + /** Concatenates a linear transformation matrix and a uniform scaling */ // NOTE this operator is defiend in MatrixBase and not as a friend function // of UniformScaling to fix an internal crash of Intel's ICC @@ -136,8 +139,6 @@ template static inline const DiagonalWrapper Scaling(const MatrixBase& coeffs) { return coeffs.asDiagonal(); } -/** \addtogroup Geometry_Module */ -//@{ /** \deprecated */ typedef DiagonalMatrix AlignedScaling2f; /** \deprecated */ diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 1999fd289..c8cb2c0cc 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -405,7 +405,7 @@ bool PastixBase::_solve_impl(const MatrixBase &b, MatrixBase &x * * \implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept, class SparseLU * */ template @@ -518,7 +518,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> > * * \implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT */ template class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> > @@ -601,7 +601,7 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> > * * \implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT */ template class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> > diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 7c238ce3c..80d914f25 100755 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -375,7 +375,7 @@ void PardisoImpl::_solve_impl(const MatrixBase &b, MatrixBase * * \implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept, class SparseLU */ template class PardisoLU : public PardisoImpl< PardisoLU > @@ -427,7 +427,7 @@ class PardisoLU : public PardisoImpl< PardisoLU > * * \implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT */ template class PardisoLLT : public PardisoImpl< PardisoLLT > @@ -487,7 +487,7 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > * * 
\implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT */ template class PardisoLDLT : public PardisoImpl< PardisoLDLT > diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index ad191085e..e2d77a761 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -42,7 +42,7 @@ namespace Eigen { * * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to * terminate in finite (and reasonable) time. - * \sa MatrixBase::genericSvd() + * \sa class BDCSVD, class JacobiSVD */ template class SVDBase @@ -74,7 +74,7 @@ public: /** \returns the \a U matrix. * * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, - * the U matrix is n-by-n if you asked for #ComputeFullU, and is n-by-m if you asked for #ComputeThinU. + * the U matrix is n-by-n if you asked for \link Eigen::ComputeFullU ComputeFullU \endlink, and is n-by-m if you asked for \link Eigen::ComputeThinU ComputeThinU \endlink. * * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed. * @@ -90,7 +90,7 @@ public: /** \returns the \a V matrix. * * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, - * the V matrix is p-by-p if you asked for #ComputeFullV, and is p-by-m if you asked for ComputeThinV. + * the V matrix is p-by-p if you asked for \link Eigen::ComputeFullV ComputeFullV \endlink, and is p-by-m if you asked for \link Eigen::ComputeThinV ComputeThinV \endlink. * * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed. * diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h index 19e06fc80..605ca42ba 100644 --- a/Eigen/src/SparseCore/SparseRef.h +++ b/Eigen/src/SparseCore/SparseRef.h @@ -13,7 +13,7 @@ namespace Eigen { enum { - StandardCompressedFormat = 2 + StandardCompressedFormat = 2 /**< used by Ref to specify whether the input storage must be in standard compressed form */ }; namespace internal { diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index d33d27f46..8d03870b1 100755 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -67,7 +67,7 @@ template struct SparseLUMatrixURetu * * \implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept * \sa \ref OrderingMethods_Module */ template diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index fd2b26581..0ae3017cc 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -456,7 +456,7 @@ class SuperLUBase : public SparseSolverBase * * \implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers + * \sa \ref TutorialSparseSolverConcept, class SparseLU */ template class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> > @@ -809,7 +809,7 @@ typename SuperLU::Scalar SuperLU::determinant() const * * \implsparsesolverconcept * - * \sa \ref TutorialSparseDirectSolvers, class ConjugateGradient, class BiCGSTAB + * \sa \ref TutorialSparseSolverConcept, class IncompleteLUT, class ConjugateGradient, class BiCGSTAB */ template diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index aaec8c6f1..929a01acb 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ 
b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -126,7 +126,9 @@ inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *N * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix. * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * - * \sa \ref TutorialSparseDirectSolvers + * \implsparsesolverconcept + * + * \sa \ref TutorialSparseSolverConcept, class SparseLU */ template class UmfPackLU : public SparseSolverBase > diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox index 62b39b201..e19c7e3a4 100644 --- a/doc/QuickReference.dox +++ b/doc/QuickReference.dox @@ -21,7 +21,7 @@ The Eigen library is divided in a Core module and several additional modules. Ea \link SVD_Module SVD \endlink\code#include \endcodeSVD decompositions with least-squares solver (JacobiSVD, BDCSVD) \link QR_Module QR \endlink\code#include \endcodeQR decomposition with solver (HouseholderQR, ColPivHouseholderQR, FullPivHouseholderQR) \link Eigenvalues_Module Eigenvalues \endlink\code#include \endcodeEigenvalue, eigenvector decompositions (EigenSolver, SelfAdjointEigenSolver, ComplexEigenSolver) -\link Sparse_modules Sparse \endlink\code#include \endcode%Sparse matrix storage and related basic linear algebra (SparseMatrix, SparseVector) \n (see \ref SparseQuickRefPage for details on sparse modules) +\link Sparse_Module Sparse \endlink\code#include \endcode%Sparse matrix storage and related basic linear algebra (SparseMatrix, SparseVector) \n (see \ref SparseQuickRefPage for details on sparse modules) \code#include \endcodeIncludes Core, Geometry, LU, Cholesky, SVD, QR, and Eigenvalues header files \code#include \endcodeIncludes %Dense and %Sparse header files (the whole Eigen library) diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 9fb3282e7..61cc50afa 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -15,20 +15,20 @@ They are summarized in the following tables: ClassSolver kindMatrix kindFeatures related to performance License
Notes
-SimplicialLLT \n #includeDirect LLt factorizationSPDFill-in reducing +SimplicialLLT \n \#includeDirect LLt factorizationSPDFill-in reducing LGPL SimplicialLDLT is often preferable -SimplicialLDLT \n #includeDirect LDLt factorizationSPDFill-in reducing +SimplicialLDLT \n \#includeDirect LDLt factorizationSPDFill-in reducing LGPL Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.) -SparseLU \n #include LU factorization +SparseLU \n \#include LU factorization Square Fill-in reducing, Leverage fast dense algebra MPL2 optimized for small and large problems with irregular patterns -SparseQR \n #include QR factorization +SparseQR \n \#include QR factorization Any, rectangular Fill-in reducing MPL2 recommended for least-square problems, has a basic rank-revealing feature @@ -40,17 +40,17 @@ They are summarized in the following tables: ClassSolver kindMatrix kindSupported preconditioners, [default] License
Notes
-ConjugateGradient \n #include Classic iterative CGSPD +ConjugateGradient \n \#include Classic iterative CGSPD IdentityPreconditioner, [DiagonalPreconditioner], IncompleteCholesky MPL2 Recommended for large symmetric problems (e.g., 3D Poisson eq.) -LeastSquaresConjugateGradient \n #includeCG for rectangular least-square problemRectangular +LeastSquaresConjugateGradient \n \#includeCG for rectangular least-square problemRectangular IdentityPreconditioner, [LeastSquareDiagonalPreconditioner] MPL2 Solve for min |A'Ax-b|^2 without forming A'A -BiCGSTAB \n #includeIterative stabilized bi-conjugate gradientSquare +BiCGSTAB \n \#includeIterative stabilized bi-conjugate gradientSquare IdentityPreconditioner, [DiagonalPreconditioner], IncompleteLUT MPL2 To speedup the convergence, try it with the \ref IncompleteLUT preconditioner. diff --git a/doc/StructHavingEigenMembers.dox b/doc/StructHavingEigenMembers.dox index bd4fa7599..7fbed0eb0 100644 --- a/doc/StructHavingEigenMembers.dox +++ b/doc/StructHavingEigenMembers.dox @@ -6,7 +6,7 @@ namespace Eigen { \section StructHavingEigenMembers_summary Executive Summary -If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must overload its "operator new" so that it generates 16-bytes-aligned pointers. Fortunately, Eigen provides you with a macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW that does that for you. +If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must overload its "operator new" so that it generates 16-bytes-aligned pointers. Fortunately, %Eigen provides you with a macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW that does that for you. \section StructHavingEigenMembers_what What kind of code needs to be changed? @@ -48,7 +48,7 @@ Foo *foo = new Foo; This macro makes "new Foo" always return an aligned pointer. -If this approach is too intrusive, see also the \ref othersolutions. +If this approach is too intrusive, see also the \ref StructHavingEigenMembers_othersolutions "other solutions". \section StructHavingEigenMembers_why Why is this needed? @@ -67,7 +67,7 @@ class Foo Foo *foo = new Foo; \endcode -A Eigen::Vector2d consists of 2 doubles, which is 128 bits. Which is exactly the size of a SSE packet, which makes it possible to use SSE for all sorts of operations on this vector. But SSE instructions (at least the ones that Eigen uses, which are the fast ones) require 128-bit alignment. Otherwise you get a segmentation fault. +A Eigen::Vector2d consists of 2 doubles, which is 128 bits. Which is exactly the size of a SSE packet, which makes it possible to use SSE for all sorts of operations on this vector. But SSE instructions (at least the ones that %Eigen uses, which are the fast ones) require 128-bit alignment. Otherwise you get a segmentation fault. For this reason, Eigen takes care by itself to require 128-bit alignment for Eigen::Vector2d, by doing two things: \li Eigen requires 128-bit alignment for the Eigen::Vector2d's array (of 2 doubles). With GCC, this is done with a __attribute__ ((aligned(16))). diff --git a/doc/TopicLazyEvaluation.dox b/doc/TopicLazyEvaluation.dox index 393bc41d8..101ef8c72 100644 --- a/doc/TopicLazyEvaluation.dox +++ b/doc/TopicLazyEvaluation.dox @@ -36,7 +36,7 @@ Here is now a more involved example: Eigen chooses lazy evaluation at every stage in that example, which is clearly the correct choice. 
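To make the two solver tables above concrete: every class listed implements the same two-step compute()/solve() workflow that the sparse solver concept prescribes. The sketch below drives one direct and one iterative solver from the tables; the 3x3 system is invented purely for illustration.

\code
#include <Eigen/Sparse>
#include <iostream>

int main()
{
  // Toy SPD system; real applications assemble A from their own data.
  Eigen::SparseMatrix<double> A(3,3);
  A.insert(0,0) = 4.0;  A.insert(1,1) = 3.0;  A.insert(2,2) = 2.0;
  A.insert(0,1) = 1.0;  A.insert(1,0) = 1.0;
  A.makeCompressed();

  Eigen::VectorXd b(3);
  b << 1.0, 2.0, 3.0;

  // Direct solver from the first table.
  Eigen::SimplicialLDLT<Eigen::SparseMatrix<double> > ldlt(A);
  Eigen::VectorXd x1 = ldlt.solve(b);

  // Iterative solver from the second table, with its default
  // DiagonalPreconditioner as listed above.
  Eigen::ConjugateGradient<Eigen::SparseMatrix<double> > cg;
  cg.compute(A);
  Eigen::VectorXd x2 = cg.solve(b);

  std::cout << x1.transpose() << "\n" << x2.transpose() << "\n";
}
\endcode

This shared workflow is exactly what the \implsparsesolverconcept alias introduced in these patches advertises.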
In fact, lazy evaluation is the "default choice" and Eigen will choose it except in a few circumstances. -The first circumstance in which Eigen chooses immediate evaluation, is when it sees an assignment a = b; and the expression \c b has the evaluate-before-assigning \link flags flag\endlink. The most important example of such an expression is the \link GeneralProduct matrix product expression\endlink. For example, when you do +The first circumstance in which Eigen chooses immediate evaluation, is when it sees an assignment a = b; and the expression \c b has the evaluate-before-assigning \link flags flag\endlink. The most important example of such an expression is the \link Product matrix product expression\endlink. For example, when you do \code matrix = matrix * matrix; \endcode @@ -48,7 +48,7 @@ What if you know that the result does no alias the operand of the product and wa Here, since we know that matrix2 is not the same matrix as matrix1, we know that lazy evaluation is not dangerous, so we may force lazy evaluation. Concretely, the effect of noalias() here is to bypass the evaluate-before-assigning \link flags flag\endlink. -The second circumstance in which Eigen chooses immediate evaluation, is when it sees a nested expression such as a + b where \c b is already an expression having the evaluate-before-nesting \link flags flag\endlink. Again, the most important example of such an expression is the \link GeneralProduct matrix product expression\endlink. For example, when you do +The second circumstance in which Eigen chooses immediate evaluation, is when it sees a nested expression such as a + b where \c b is already an expression having the evaluate-before-nesting \link flags flag\endlink. Again, the most important example of such an expression is the \link Product matrix product expression\endlink. For example, when you do \code matrix1 = matrix2 + matrix3 * matrix4; \endcode diff --git a/doc/TutorialArrayClass.dox b/doc/TutorialArrayClass.dox index 6432684aa..f6f351091 100644 --- a/doc/TutorialArrayClass.dox +++ b/doc/TutorialArrayClass.dox @@ -157,7 +157,7 @@ The following example shows how to use array operations on a Matrix object by em * to multiply them coefficient-wise and assigns the result to the matrix variable \c result (this is legal because Eigen allows assigning array expressions to matrix variables). -As a matter of fact, this usage case is so common that Eigen provides a \link MatrixBase::cwiseProduct() const +As a matter of fact, this usage case is so common that Eigen provides a \link MatrixBase::cwiseProduct const .cwiseProduct(.) \endlink method for matrices to compute the coefficient-wise product. This is also shown in the example program. diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox index 908a1b4b2..6d25ff0ea 100644 --- a/doc/TutorialReductionsVisitorsBroadcasting.dox +++ b/doc/TutorialReductionsVisitorsBroadcasting.dox @@ -32,7 +32,7 @@ Eigen also provides the \link MatrixBase::norm() norm() \endlink method, which r These operations can also operate on matrices; in that case, a n-by-p matrix is seen as a vector of size (n*p), so for example the \link MatrixBase::norm() norm() \endlink method returns the "Frobenius" or "Hilbert-Schmidt" norm. We refrain from speaking of the \f$\ell^2\f$ norm of a matrix because that can mean different things. -If you want other coefficient-wise \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm() lpNorm
<p>
() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients. +If you want other coefficient-wise \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm lpNorm
<p>
() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients. The following example demonstrates these methods. @@ -90,7 +90,7 @@ Array. The arguments passed to a visitor are pointers to the variables where the row and column position are to be stored. These variables should be of type -\link DenseBase::Index Index \endlink, as shown below: +\link Eigen::Index Index \endlink, as shown below: -- cgit v1.2.3 From eadc377b3fb3d9586ab3c468fc03b5ef81ddac6d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 16:43:19 +0100 Subject: Add missing doc of Derived template parameter --- Eigen/src/SparseCore/SparseMatrixBase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 648ae1f8a..2a90f40bf 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -18,7 +18,7 @@ namespace Eigen { * * \brief Base class of any sparse matrices or sparse expressions * - * \tparam Derived + * \tparam Derived is the derived type, e.g. a sparse matrix type, or an expression, etc. * * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN. -- cgit v1.2.3 From addb7066e8005224cb7723900aa9f3ab2055a10b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 16:45:44 +0100 Subject: Workaround "empty paragraph" warning with clang -Wdocumentation --- Eigen/src/Core/CommaInitializer.h | 2 +- Eigen/src/Core/DenseBase.h | 4 ++-- Eigen/src/Core/DenseCoeffsBase.h | 4 ++-- Eigen/src/Core/EigenBase.h | 2 +- Eigen/src/Core/MatrixBase.h | 2 +- Eigen/src/Core/PermutationMatrix.h | 10 +++++----- Eigen/src/Core/util/Constants.h | 6 +++--- Eigen/src/Core/util/Memory.h | 2 +- Eigen/src/Core/util/XprHelper.h | 2 +- doc/Doxyfile.in | 3 ++- 10 files changed, 19 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index 89bcd750c..2abc6605c 100644 --- a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h @@ -22,7 +22,7 @@ namespace Eigen { * the return type of MatrixBase::operator<<, and most of the time this is the only * way it is used. * - * \sa \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished() + * \sa \blank \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished() */ template struct CommaInitializer diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index e181dafaf..2c5c0ad28 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -36,7 +36,7 @@ static inline void check_DenseIndex_is_signed() { * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN. * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template class DenseBase #ifndef EIGEN_PARSED_BY_DOXYGEN @@ -60,7 +60,7 @@ template class DenseBase * \brief The type used to store indices * \details This typedef is relevant for types that store multiple indices such as * PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index - * \sa \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase. 
+ * \sa \blank \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase. */ typedef typename internal::traits::StorageIndex StorageIndex; diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 820a90e6f..28cc1432c 100644 --- a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h @@ -448,7 +448,7 @@ class DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read-only using * \c operator() . * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template class DenseCoeffsBase : public DenseCoeffsBase @@ -521,7 +521,7 @@ class DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read/write using * \c operator(). * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template class DenseCoeffsBase diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h index 79dabda37..ba8e09674 100644 --- a/Eigen/src/Core/EigenBase.h +++ b/Eigen/src/Core/EigenBase.h @@ -23,7 +23,7 @@ namespace Eigen { * * Notice that this class is trivial, it is only used to disambiguate overloaded functions. * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template struct EigenBase { diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 8d400d277..f3935802d 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -43,7 +43,7 @@ namespace Eigen { * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN. * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template class MatrixBase : public DenseBase diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index 90e1df233..aad4ccecd 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -192,13 +192,13 @@ class PermutationBase : public EigenBase /** \returns the inverse permutation matrix. * - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ inline InverseReturnType inverse() const { return InverseReturnType(derived()); } /** \returns the tranpose permutation matrix. * - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ inline InverseReturnType transpose() const { return InverseReturnType(derived()); } @@ -225,7 +225,7 @@ class PermutationBase : public EigenBase /** \returns the product permutation matrix. * - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ template inline PlainPermutationType operator*(const PermutationBase& other) const @@ -233,7 +233,7 @@ class PermutationBase : public EigenBase /** \returns the product of a permutation with another inverse permutation. * - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ template inline PlainPermutationType operator*(const InverseImpl& other) const @@ -241,7 +241,7 @@ class PermutationBase : public EigenBase /** \returns the product of an inverse permutation with another permutation. 
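The inverse(), transpose() and product operators documented in this hunk compose in the obvious way. A minimal sketch, using nothing beyond the public PermutationMatrix API (the 3x3 values are invented for illustration):

\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  // The permutation sends index i to indices()[i], so P*M moves
  // row i of M to row indices()[i].
  Eigen::PermutationMatrix<3> P;
  P.indices() << 1, 2, 0;

  Eigen::Matrix3d M;
  M << 1, 2, 3,
       4, 5, 6,
       7, 8, 9;

  Eigen::Matrix3d rowPerm = P * M;                 // permutes rows
  Eigen::Matrix3d colPerm = M * P;                 // permutes columns
  Eigen::Matrix3d back    = P.inverse() * rowPerm; // recovers M

  std::cout << rowPerm << "\n\n" << colPerm << "\n\n" << back << "\n";
}
\endcode

For a permutation matrix the inverse coincides with the transpose, and both are returned by value through the special constructor that the \note_try_to_help_rvo alias above refers to.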
* - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ template friend inline PlainPermutationType operator*(const InverseImpl& other, const PermutationBase& perm) diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index ec4a79e99..960bb434d 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -56,8 +56,8 @@ const int HugeCost = 10000; * for a matrix, this means that the storage order is row-major. * If this bit is not set, the storage order is column-major. * For an expression, this determines the storage order of - * the matrix created by evaluation of that expression. - * \sa \ref TopicStorageOrders */ + * the matrix created by evaluation of that expression. + * \sa \blank \ref TopicStorageOrders */ const unsigned int RowMajorBit = 0x1; /** \ingroup flags @@ -168,7 +168,7 @@ const unsigned int NestByRefBit = 0x100; * can be either row-major or column-major. * The precise choice will be decided at evaluation time or when * combined with other expressions. - * \sa \ref RowMajorBit, \ref TopicStorageOrders */ + * \sa \blank \ref RowMajorBit, \ref TopicStorageOrders */ const unsigned int NoPreferredStorageOrderBit = 0x200; /** \ingroup flags diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 1fc535a3a..1a899ea6c 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -784,7 +784,7 @@ template void swap(scoped_array &a,scoped_array &b) * std::map< int, Vector3f > my_map_vec3; * \endcode * -* \sa \ref TopicStlContainers. +* \sa \blank \ref TopicStlContainers. */ template class aligned_allocator : public std::allocator diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index f9e2959cc..6b93d5221 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -29,7 +29,7 @@ typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex; /** * \brief The Index type as used for the API. * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. - * \sa \ref TopicPreprocessorDirectives, StorageIndex. + * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex. */ typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index; diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index f9d5af812..0a43c7c4e 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -224,7 +224,8 @@ ALIASES = "only_for_vectors=This is only for vectors (either row- "note_about_checking_solutions=This method just tries to find as good a solution as possible. If you want to check whether a solution exists or if it is accurate, just call this function to get a result and then compute the error of this result, or use MatrixBase::isApprox() directly, for instance like this: \code bool a_solution_exists = (A*result).isApprox(b, precision); \endcode This method avoids dividing by zero, so that the non-existence of a solution doesn't by itself mean that you'll get \c inf or \c nan values." \ "note_try_to_help_rvo=This function returns the result by value. In order to make that efficient, it is implemented as just a return statement using a special constructor, hopefully allowing the compiler to perform a RVO (return value optimization)." \ "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\"" \ - "implsparsesolverconcept=This class follows the \link TutorialSparseSolverConcept sparse solver concept \endlink." 
+ "implsparsesolverconcept=This class follows the \link TutorialSparseSolverConcept sparse solver concept \endlink." \ + "blank= " ALIASES += "eigenAutoToc= " -- cgit v1.2.3 From 70404e07c246b986bac4661428dda92b182725a2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 16:46:45 +0100 Subject: Workaround clang -Wdocumentation warning about "/*<" --- Eigen/src/Core/arch/SSE/Complex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 4f45ddfbf..fd7f4d740 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -255,7 +255,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); } -EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) +EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) { return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); } @@ -456,7 +456,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } -EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) +EIGEN_STRONG_INLINE Packet1cd pcplxflip/* */(const Packet1cd& x) { return Packet1cd(preverse(Packet2d(x.v))); } -- cgit v1.2.3 From 9900782e882b9429e44ad4902476cbaa489edbfa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Dec 2015 16:47:49 +0100 Subject: Mark AlignedBit and EvalBeforeNestingBit with deprecated attribute, and remove the remaining usages of EvalBeforeNestingBit. --- Eigen/src/Core/TriangularMatrix.h | 9 +-------- Eigen/src/Core/util/Constants.h | 6 +++--- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 099a02ec3..7d6a97848 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -595,14 +595,7 @@ template template void TriangularBase::evalTo(MatrixBase &other) const { - if(internal::traits::Flags & EvalBeforeAssigningBit) - { - typename internal::plain_matrix_type::type other_evaluated(rows(), cols()); - evalToLazy(other_evaluated); - other.derived().swap(other_evaluated); - } - else - evalToLazy(other.derived()); + evalToLazy(other.derived()); } /*************************************************************************** diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 960bb434d..9e6816021 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -67,6 +67,7 @@ const unsigned int EvalBeforeNestingBit = 0x2; /** \ingroup flags * \deprecated * means the expression should be evaluated before any assignment */ +EIGEN_DEPRECATED const unsigned int EvalBeforeAssigningBit = 0x4; // FIXME deprecated /** \ingroup flags @@ -158,7 +159,7 @@ const unsigned int DirectAccessBit = 0x40; * expression.packet(0); * \endcode */ -const unsigned int AlignedBit = 0x80; +EIGEN_DEPRECATED const unsigned int AlignedBit = 0x80; const unsigned int NestByRefBit = 0x100; @@ -187,8 +188,7 @@ const unsigned int CompressedAccessBit = 0x400; // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit - | EvalBeforeNestingBit - | EvalBeforeAssigningBit; + | EvalBeforeNestingBit; /** \defgroup enums Enumerations * \ingroup Core_Module -- cgit v1.2.3 From 8b0d1eb0f7f1ab017a8e603f3887143df15662d7 Mon Sep 17 00:00:00 
2001 From: Gael Guennebaud Date: Fri, 1 Jan 2016 21:45:06 +0100 Subject: Fix numerous doxygen shortcomings, and workaround some clang -Wdocumentation warnings --- Eigen/src/Cholesky/LDLT.h | 4 +- Eigen/src/Cholesky/LLT.h | 9 +- Eigen/src/Core/Array.h | 24 ++-- Eigen/src/Core/BandMatrix.h | 24 ++-- Eigen/src/Core/Block.h | 69 +++++----- Eigen/src/Core/CwiseBinaryOp.h | 39 +++--- Eigen/src/Core/CwiseNullaryOp.h | 25 ++-- Eigen/src/Core/CwiseUnaryOp.h | 39 +++--- Eigen/src/Core/CwiseUnaryView.h | 27 ++-- Eigen/src/Core/IO.h | 2 +- Eigen/src/Core/Map.h | 45 ++++--- Eigen/src/Core/MathFunctions.h | 2 +- Eigen/src/Core/Matrix.h | 86 ++++++------- Eigen/src/Core/NestByValue.h | 15 ++- Eigen/src/Core/NoAlias.h | 2 +- Eigen/src/Core/NumTraits.h | 2 +- Eigen/src/Core/PermutationMatrix.h | 65 +++++----- Eigen/src/Core/Product.h | 31 +++-- Eigen/src/Core/Ref.h | 140 ++++++++++----------- Eigen/src/Core/Replicate.h | 31 ++--- Eigen/src/Core/ReturnByValue.h | 9 +- Eigen/src/Core/Reverse.h | 28 ++--- Eigen/src/Core/Stride.h | 4 +- Eigen/src/Core/Transpose.h | 27 ++-- Eigen/src/Core/Transpositions.h | 58 ++++----- Eigen/src/Core/VectorBlock.h | 25 ++-- Eigen/src/Core/VectorwiseOp.h | 8 +- Eigen/src/Core/util/XprHelper.h | 8 +- Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 15 ++- Eigen/src/Geometry/Hyperplane.h | 4 +- Eigen/src/Geometry/ParametrizedLine.h | 4 +- Eigen/src/Geometry/Rotation2D.h | 2 +- Eigen/src/Geometry/RotationBase.h | 8 +- Eigen/src/Geometry/Scaling.h | 2 +- Eigen/src/Geometry/Translation.h | 4 +- .../IterativeLinearSolvers/IncompleteCholesky.h | 2 +- Eigen/src/LU/FullPivLU.h | 2 +- Eigen/src/LU/PartialPivLU.h | 2 +- Eigen/src/OrderingMethods/Amd.h | 7 +- Eigen/src/OrderingMethods/Ordering.h | 9 +- Eigen/src/QR/ColPivHouseholderQR.h | 2 +- Eigen/src/QR/FullPivHouseholderQR.h | 2 +- Eigen/src/QR/HouseholderQR.h | 2 +- Eigen/src/SVD/BDCSVD.h | 5 +- Eigen/src/SVD/JacobiSVD.h | 4 +- Eigen/src/SparseCholesky/SimplicialCholesky.h | 10 +- Eigen/src/SparseLU/SparseLU_kernel_bmod.h | 27 ++-- unsupported/Eigen/src/SparseExtra/RandomSetter.h | 6 +- 48 files changed, 476 insertions(+), 491 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 6fcae01f7..c3cc3746c 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -28,8 +28,8 @@ namespace internal { * * \brief Robust Cholesky decomposition of a matrix with pivoting * - * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition - * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. + * \tparam _MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition + * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. * The other triangular part won't be read. * * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 1f0091f3c..74cf5bfe1 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -22,8 +22,8 @@ template struct LLT_Traits; * * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features * - * \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition - * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. 
+ * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition + * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite @@ -436,10 +436,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * \param bAndX represents both the right-hand side matrix b and result x. * - * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD. - * - * This version avoids a copy when the right hand side matrix b is not - * needed anymore. + * This version avoids a copy when the right hand side matrix b is not needed anymore. * * \sa LLT::solve(), MatrixBase::llt() */ diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index e38eda72c..7480d1e24 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -12,7 +12,16 @@ namespace Eigen { -/** \class Array +namespace internal { +template +struct traits > : traits > +{ + typedef ArrayXpr XprKind; + typedef ArrayBase > XprBase; +}; +} + +/** \class Array * \ingroup Core_Module * * \brief General-purpose arrays with easy API for coefficient-wise operations @@ -26,21 +35,12 @@ namespace Eigen { * * See documentation of class Matrix for detailed information on the template parameters * storage layout. - * + * * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN. * - * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy + * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy */ -namespace internal { -template -struct traits > : traits > -{ - typedef ArrayXpr XprKind; - typedef ArrayBase > XprBase; -}; -} - template class Array : public PlainObjectBase > diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h index 87c124fdf..4978c9140 100644 --- a/Eigen/src/Core/BandMatrix.h +++ b/Eigen/src/Core/BandMatrix.h @@ -161,15 +161,15 @@ class BandMatrixBase : public EigenBase * * \brief Represents a rectangular matrix with a banded storage * - * \param _Scalar Numeric type, i.e. float, double, int - * \param Rows Number of rows, or \b Dynamic - * \param Cols Number of columns, or \b Dynamic - * \param Supers Number of super diagonal - * \param Subs Number of sub diagonal - * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint - * The former controls \ref TopicStorageOrders "storage order", and defaults to - * column-major. The latter controls whether the matrix represents a selfadjoint - * matrix in which case either Supers of Subs have to be null. + * \tparam _Scalar Numeric type, i.e. float, double, int + * \tparam _Rows Number of rows, or \b Dynamic + * \tparam _Cols Number of columns, or \b Dynamic + * \tparam _Supers Number of super diagonal + * \tparam _Subs Number of sub diagonal + * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint + * The former controls \ref TopicStorageOrders "storage order", and defaults to + * column-major. The latter controls whether the matrix represents a selfadjoint + * matrix in which case either Supers of Subs have to be null. 
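The LLT interface documented here boils down to a factor-then-solve call sequence. A minimal sketch, with an SPD matrix constructed artificially for illustration:

\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  // A = B*B^T + c*I is a simple way to obtain an SPD test matrix.
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd A = B * B.transpose() + 4.0 * Eigen::MatrixXd::Identity(4, 4);
  Eigen::VectorXd b = Eigen::VectorXd::Random(4);

  // Standard Cholesky; only the Lower (default) triangle of A is read.
  Eigen::LLT<Eigen::MatrixXd> llt(A);
  Eigen::VectorXd x = llt.solve(b);

  std::cout << "residual: " << (A * x - b).norm() << "\n";
}
\endcode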
* * \sa class TridiagonalMatrix */ @@ -302,9 +302,9 @@ class BandMatrixWrapper : public BandMatrixBase(Index,Index) and - * most of the time this is the only way it is used. - * - * However, if you want to directly maniputate block expressions, - * for instance if you want to write a function returning such an expression, you - * will need to use this class. - * - * Here is an example illustrating the dynamic case: - * \include class_Block.cpp - * Output: \verbinclude class_Block.out - * - * \note Even though this expression has dynamic size, in the case where \a XprType - * has fixed size, this expression inherits a fixed maximal size which means that evaluating - * it does not cause a dynamic memory allocation. - * - * Here is an example illustrating the fixed-size case: - * \include class_FixedBlock.cpp - * Output: \verbinclude class_FixedBlock.out - * - * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock - */ - namespace internal { template struct traits > : traits @@ -101,6 +66,40 @@ template class BlockImpl; +/** \class Block + * \ingroup Core_Module + * + * \brief Expression of a fixed-size or dynamic-size block + * + * \tparam XprType the type of the expression in which we are taking a block + * \tparam BlockRows the number of rows of the block we are taking at compile time (optional) + * \tparam BlockCols the number of columns of the block we are taking at compile time (optional) + * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or + * to set of columns of a column major matrix (optional). The parameter allows to determine + * at compile time whether aligned access is possible on the block expression. + * + * This class represents an expression of either a fixed-size or dynamic-size block. It is the return + * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block(Index,Index) and + * most of the time this is the only way it is used. + * + * However, if you want to directly maniputate block expressions, + * for instance if you want to write a function returning such an expression, you + * will need to use this class. + * + * Here is an example illustrating the dynamic case: + * \include class_Block.cpp + * Output: \verbinclude class_Block.out + * + * \note Even though this expression has dynamic size, in the case where \a XprType + * has fixed size, this expression inherits a fixed maximal size which means that evaluating + * it does not cause a dynamic memory allocation. + * + * Here is an example illustrating the fixed-size case: + * \include class_FixedBlock.cpp + * Output: \verbinclude class_FixedBlock.out + * + * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock + */ template class Block : public BlockImpl::StorageKind> { diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index e42c3031b..f94629e6d 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -13,26 +13,6 @@ namespace Eigen { -/** \class CwiseBinaryOp - * \ingroup Core_Module - * - * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions - * - * \param BinaryOp template functor implementing the operator - * \param Lhs the type of the left-hand side - * \param Rhs the type of the right-hand side - * - * This class represents an expression where a coefficient-wise binary operator is applied to two expressions. 
- * It is the return type of binary operators, by which we mean only those binary operators where - * both the left-hand side and the right-hand side are Eigen expressions. - * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp. - * - * Most of the time, this is the only way that it is used, so you typically don't have to name - * CwiseBinaryOp types explicitly. - * - * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp - */ - namespace internal { template struct traits > @@ -74,6 +54,25 @@ struct traits > template class CwiseBinaryOpImpl; +/** \class CwiseBinaryOp + * \ingroup Core_Module + * + * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions + * + * \tparam BinaryOp template functor implementing the operator + * \tparam LhsType the type of the left-hand side + * \tparam RhsType the type of the right-hand side + * + * This class represents an expression where a coefficient-wise binary operator is applied to two expressions. + * It is the return type of binary operators, by which we mean only those binary operators where + * both the left-hand side and the right-hand side are Eigen expressions. + * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp. + * + * Most of the time, this is the only way that it is used, so you typically don't have to name + * CwiseBinaryOp types explicitly. + * + * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp + */ template class CwiseBinaryOp : public CwiseBinaryOpImpl< diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 1dd109d3d..67b99653f 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -12,13 +12,23 @@ namespace Eigen { +namespace internal { +template +struct traits > : traits +{ + enum { + Flags = traits::Flags & RowMajorBit + }; +}; +} + /** \class CwiseNullaryOp * \ingroup Core_Module * * \brief Generic expression of a matrix where all coefficients are defined by a functor * - * \param NullaryOp template functor implementing the operator - * \param PlainObjectType the underlying plain matrix/array type + * \tparam NullaryOp template functor implementing the operator + * \tparam PlainObjectType the underlying plain matrix/array type * * This class represents an expression of a generic nullary operator. * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods, @@ -29,17 +39,6 @@ namespace Eigen { * * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr() */ - -namespace internal { -template -struct traits > : traits -{ - enum { - Flags = traits::Flags & RowMajorBit - }; -}; -} - template class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp >::type, internal::no_assignment_operator { diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index da1d1992d..5a809cf21 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -13,26 +13,6 @@ namespace Eigen { -/** \class CwiseUnaryOp - * \ingroup Core_Module - * - * \brief Generic expression where a coefficient-wise unary operator is applied to an expression - * - * \param UnaryOp template functor implementing the operator - * \param XprType the type of the expression to which we are applying the unary operator - * - * This class represents an expression where a unary operator is applied to an expression. 
- * It is the return type of all operations taking exactly 1 input expression, regardless of the - * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix - * is considered unary, because only the right-hand side is an expression, and its - * return type is a specialization of CwiseUnaryOp. - * - * Most of the time, this is the only way that it is used, so you typically don't have to name - * CwiseUnaryOp types explicitly. - * - * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp - */ - namespace internal { template struct traits > @@ -52,6 +32,25 @@ struct traits > template class CwiseUnaryOpImpl; +/** \class CwiseUnaryOp + * \ingroup Core_Module + * + * \brief Generic expression where a coefficient-wise unary operator is applied to an expression + * + * \tparam UnaryOp template functor implementing the operator + * \tparam XprType the type of the expression to which we are applying the unary operator + * + * This class represents an expression where a unary operator is applied to an expression. + * It is the return type of all operations taking exactly 1 input expression, regardless of the + * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix + * is considered unary, because only the right-hand side is an expression, and its + * return type is a specialization of CwiseUnaryOp. + * + * Most of the time, this is the only way that it is used, so you typically don't have to name + * CwiseUnaryOp types explicitly. + * + * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp + */ template class CwiseUnaryOp : public CwiseUnaryOpImpl::StorageKind>, internal::no_assignment_operator { diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 72244751e..5a7db2b19 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -12,20 +12,6 @@ namespace Eigen { -/** \class CwiseUnaryView - * \ingroup Core_Module - * - * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector - * - * \param ViewOp template functor implementing the view - * \param MatrixType the type of the matrix we are applying the unary operator - * - * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector. - * It is the return type of real() and imag(), and most of the time this is the only way it is used. - * - * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp - */ - namespace internal { template struct traits > @@ -55,6 +41,19 @@ struct traits > template class CwiseUnaryViewImpl; +/** \class CwiseUnaryView + * \ingroup Core_Module + * + * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector + * + * \tparam ViewOp template functor implementing the view + * \tparam MatrixType the type of the matrix we are applying the unary operator + * + * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector. + * It is the return type of real() and imag(), and most of the time this is the only way it is used. 
+ * + * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp + */ template class CwiseUnaryView : public CwiseUnaryViewImpl::StorageKind> { diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h index 9ae37bb5a..dfd9097cc 100644 --- a/Eigen/src/Core/IO.h +++ b/Eigen/src/Core/IO.h @@ -80,7 +80,7 @@ struct IOFormat * * \brief Pseudo expression providing matrix output with given format * - * \param ExpressionType the type of the object on which IO stream operations are performed + * \tparam ExpressionType the type of the object on which IO stream operations are performed * * This class represents an expression with stream operators controlled by a given IOFormat. * It is the return type of DenseBase::format() diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 3a8375da9..06d196702 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -13,6 +13,28 @@ namespace Eigen { +namespace internal { +template +struct traits > + : public traits +{ + typedef traits TraitsBase; + enum { + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 + ? int(PlainObjectType::InnerStrideAtCompileTime) + : int(StrideType::InnerStrideAtCompileTime), + OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 + ? int(PlainObjectType::OuterStrideAtCompileTime) + : int(StrideType::OuterStrideAtCompileTime), + Alignment = int(MapOptions)&int(AlignedMask), + Flags0 = TraitsBase::Flags & (~NestByRefBit), + Flags = is_lvalue::value ? int(Flags0) : (int(Flags0) & ~LvalueBit) + }; +private: + enum { Options }; // Expressions don't have Options +}; +} + /** \class Map * \ingroup Core_Module * @@ -63,29 +85,6 @@ namespace Eigen { * * \sa PlainObjectBase::Map(), \ref TopicStorageOrders */ - -namespace internal { -template -struct traits > - : public traits -{ - typedef traits TraitsBase; - enum { - InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 - ? int(PlainObjectType::InnerStrideAtCompileTime) - : int(StrideType::InnerStrideAtCompileTime), - OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) - : int(StrideType::OuterStrideAtCompileTime), - Alignment = int(MapOptions)&int(AlignedMask), - Flags0 = TraitsBase::Flags & (~NestByRefBit), - Flags = is_lvalue::value ? int(Flags0) : (int(Flags0) & ~LvalueBit) - }; -private: - enum { Options }; // Expressions don't have Options -}; -} - template class Map : public MapBase > { diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 48cf565fb..4d5e1acb8 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -26,7 +26,7 @@ long double abs(long double x) { return (fabsl(x)); } namespace internal { -/** \internal \struct global_math_functions_filtering_base +/** \internal \class global_math_functions_filtering_base * * What it does: * Defines a typedef 'type' as follows: diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index ce1b70d23..bcbbbf9ae 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -13,6 +13,45 @@ namespace Eigen { +namespace internal { +template +struct traits > +{ +private: + enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; + typedef typename find_best_packet<_Scalar,size>::type PacketScalar; + enum { + row_major_bit = _Options&RowMajor ? RowMajorBit : 0, + is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, + max_size = is_dynamic_size_storage ? 
Dynamic : _MaxRows*_MaxCols, + default_alignment = compute_default_alignment<_Scalar,max_size>::value, + actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, + required_alignment = unpacket_traits::alignment, + packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0 + }; + +public: + typedef _Scalar Scalar; + typedef Dense StorageKind; + typedef Eigen::Index StorageIndex; + typedef MatrixXpr XprKind; + enum { + RowsAtCompileTime = _Rows, + ColsAtCompileTime = _Cols, + MaxRowsAtCompileTime = _MaxRows, + MaxColsAtCompileTime = _MaxCols, + Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, + Options = _Options, + InnerStrideAtCompileTime = 1, + OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, + + // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase + EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, + Alignment = actual_alignment + }; +}; +} + /** \class Matrix * \ingroup Core_Module * @@ -98,7 +137,7 @@ namespace Eigen { * * * ABI and storage layout - * + * * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3. *
<tr><th>Example:</th><th>Output:</th></tr>
* @@ -130,50 +169,11 @@ namespace Eigen { *
<tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
* Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two * smaller to EIGEN_MAX_STATIC_ALIGN_BYTES. - * - * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, - * \ref TopicStorageOrders + * + * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, + * \ref TopicStorageOrders */ -namespace internal { -template -struct traits > -{ -private: - enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; - typedef typename find_best_packet<_Scalar,size>::type PacketScalar; - enum { - row_major_bit = _Options&RowMajor ? RowMajorBit : 0, - is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, - max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols, - default_alignment = compute_default_alignment<_Scalar,max_size>::value, - actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, - required_alignment = unpacket_traits::alignment, - packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0 - }; - -public: - typedef _Scalar Scalar; - typedef Dense StorageKind; - typedef Eigen::Index StorageIndex; - typedef MatrixXpr XprKind; - enum { - RowsAtCompileTime = _Rows, - ColsAtCompileTime = _Cols, - MaxRowsAtCompileTime = _MaxRows, - MaxColsAtCompileTime = _MaxCols, - Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, - Options = _Options, - InnerStrideAtCompileTime = 1, - OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, - - // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase - EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, - Alignment = actual_alignment - }; -}; -} - template class Matrix : public PlainObjectBase > diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 9aeaf8d18..13adf070e 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -13,25 +13,24 @@ namespace Eigen { +namespace internal { +template +struct traits > : public traits +{}; +} + /** \class NestByValue * \ingroup Core_Module * * \brief Expression which must be nested by value * - * \param ExpressionType the type of the object of which we are requiring nesting-by-value + * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value * * This class is the return type of MatrixBase::nestByValue() * and most of the time this is the only way it is used. * * \sa MatrixBase::nestByValue() */ - -namespace internal { -template -struct traits > : public traits -{}; -} - template class NestByValue : public internal::dense_xpr_base< NestByValue >::type { diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 0ade75255..ffb673cee 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief Pseudo expression providing an operator = assuming no aliasing * - * \param ExpressionType the type of the object on which to do the lazy assignment + * \tparam ExpressionType the type of the object on which to do the lazy assignment * * This class represents an expression with special assignment operators * assuming no aliasing between the target expression and the source expression. 
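A minimal sketch of the intended use of noalias(), whose return value is the NoAlias pseudo expression described above:

\code
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd a = Eigen::MatrixXd::Random(3, 3);
  Eigen::MatrixXd b = Eigen::MatrixXd::Random(3, 3);
  Eigen::MatrixXd c(3, 3);

  // Plain assignment evaluates a*b into a temporary first, because a
  // matrix product is normally assumed to alias its destination.
  c = a * b;

  // If the destination is known not to alias the operands, noalias()
  // skips that temporary and writes the product directly into c.
  c.noalias() = a * b;
}
\endcode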
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 1d85dec72..d1aabd995 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen. * - * \param T the numeric type at hand + * \tparam T the numeric type at hand * * This class stores enums, typedefs and static methods giving information about a numeric type. * diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index aad4ccecd..b1fb455b9 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -13,12 +13,18 @@ namespace Eigen { +namespace internal { + +enum PermPermProduct_t {PermPermProduct}; + +} // end namespace internal + /** \class PermutationBase * \ingroup Core_Module * * \brief Base class for permutations * - * \param Derived the derived class + * \tparam Derived the derived class * * This class is the base class for all expressions representing a permutation matrix, * internally stored as a vector of integers. @@ -36,13 +42,6 @@ namespace Eigen { * * \sa class PermutationMatrix, class PermutationWrapper */ - -namespace internal { - -enum PermPermProduct_t {PermPermProduct}; - -} // end namespace internal - template class PermutationBase : public EigenBase { @@ -280,20 +279,6 @@ class PermutationBase : public EigenBase }; -/** \class PermutationMatrix - * \ingroup Core_Module - * - * \brief Permutation matrix - * - * \param SizeAtCompileTime the number of rows/cols, or Dynamic - * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. - * \param StorageIndex the integer type of the indices - * - * This class represents a permutation matrix, internally stored as a vector of integers. - * - * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix - */ - namespace internal { template struct traits > @@ -306,6 +291,19 @@ struct traits class PermutationMatrix : public PermutationBase > { @@ -482,18 +480,6 @@ class Map class TranspositionsWrapper; namespace internal { template @@ -513,6 +499,17 @@ struct traits > }; } +/** \class PermutationWrapper + * \ingroup Core_Module + * + * \brief Class to view a vector of integers as a permutation matrix + * + * \tparam _IndicesType the type of the vector of integer (can be any compatible expression) + * + * This class allows to view any vector expression of integers as a permutation matrix. + * + * \sa class PermutationBase, class PermutationMatrix + */ template class PermutationWrapper : public PermutationBase > { diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index fdd2fed3f..8aa1de081 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -14,22 +14,6 @@ namespace Eigen { template class ProductImpl; -/** \class Product - * \ingroup Core_Module - * - * \brief Expression of the product of two arbitrary matrices or vectors - * - * \param Lhs the type of the left-hand side expression - * \param Rhs the type of the right-hand side expression - * - * This class represents an expression of the product of two arbitrary matrices. - * - * The other template parameters are: - * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct - * - */ - - namespace internal { // Determine the scalar of Product. 
This is normally the same as Lhs::Scalar times @@ -102,7 +86,20 @@ struct traits > } // end namespace internal - +/** \class Product + * \ingroup Core_Module + * + * \brief Expression of the product of two arbitrary matrices or vectors + * + * \tparam _Lhs the type of the left-hand side expression + * \tparam _Rhs the type of the right-hand side expression + * + * This class represents an expression of the product of two arbitrary matrices. + * + * The other template parameters are: + * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct + * + */ template class Product : public ProductImpl<_Lhs,_Rhs,Option, typename internal::product_promote_storage_type::StorageKind, diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 61de5ed17..ae414204e 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -12,76 +12,6 @@ namespace Eigen { -/** \class Ref - * \ingroup Core_Module - * - * \brief A matrix or vector expression mapping an existing expression - * - * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. - * The default is \c #Unaligned. - * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), - * but accepts a variable outer stride (leading dimension). - * This can be overridden by specifying strides. - * The type passed here must be a specialization of the Stride template, see examples below. - * - * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies. - * A Ref<> object can represent either a const expression or a l-value: - * \code - * // in-out argument: - * void foo1(Ref x); - * - * // read-only const argument: - * void foo2(const Ref& x); - * \endcode - * - * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered. - * By default, a Ref can reference any dense vector expression of float having a contiguous memory layout. - * Likewise, a Ref can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with - * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension) - * can be greater than the number of rows. - * - * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function. - * Here are some examples: - * \code - * MatrixXf A; - * VectorXf a; - * foo1(a.head()); // OK - * foo1(A.col()); // OK - * foo1(A.row()); // Compilation error because here innerstride!=1 - * foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object - * foo2(A.row().transpose()); // The row is copied into a contiguous temporary - * foo2(2*a); // The expression is evaluated into a temporary - * foo2(A.col().segment(2,4)); // No temporary - * \endcode - * - * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters. 
- * Here is an example accepting an innerstride!=1: - * \code - * // in-out argument: - * void foo3(Ref > x); - * foo3(A.row()); // OK - * \endcode - * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more - * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a - * template function, e.g.: - * \code - * // in the .h: - * void foo(const Ref& A); - * void foo(const Ref >& A); - * - * // in the .cpp: - * template void foo_impl(const TypeOfA& A) { - * ... // crazy code goes here - * } - * void foo(const Ref& A) { foo_impl(A); } - * void foo(const Ref >& A) { foo_impl(A); } - * \endcode - * - * - * \sa PlainObjectBase::Map(), \ref TopicStorageOrders - */ - namespace internal { template @@ -182,7 +112,75 @@ protected: StrideBase m_stride; }; - +/** \class Ref + * \ingroup Core_Module + * + * \brief A matrix or vector expression mapping an existing expression + * + * \tparam PlainObjectType the equivalent matrix type of the mapped data + * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. + * The default is \c #Unaligned. + * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), + * but accepts a variable outer stride (leading dimension). + * This can be overridden by specifying strides. + * The type passed here must be a specialization of the Stride template, see examples below. + * + * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies. + * A Ref<> object can represent either a const expression or a l-value: + * \code + * // in-out argument: + * void foo1(Ref x); + * + * // read-only const argument: + * void foo2(const Ref& x); + * \endcode + * + * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered. + * By default, a Ref can reference any dense vector expression of float having a contiguous memory layout. + * Likewise, a Ref can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with + * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension) + * can be greater than the number of rows. + * + * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function. + * Here are some examples: + * \code + * MatrixXf A; + * VectorXf a; + * foo1(a.head()); // OK + * foo1(A.col()); // OK + * foo1(A.row()); // Compilation error because here innerstride!=1 + * foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object + * foo2(A.row().transpose()); // The row is copied into a contiguous temporary + * foo2(2*a); // The expression is evaluated into a temporary + * foo2(A.col().segment(2,4)); // No temporary + * \endcode + * + * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters. 
+ * Here is an example accepting an innerstride!=1: + * \code + * // in-out argument: + * void foo3(Ref > x); + * foo3(A.row()); // OK + * \endcode + * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more + * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a + * template function, e.g.: + * \code + * // in the .h: + * void foo(const Ref& A); + * void foo(const Ref >& A); + * + * // in the .cpp: + * template void foo_impl(const TypeOfA& A) { + * ... // crazy code goes here + * } + * void foo(const Ref& A) { foo_impl(A); } + * void foo(const Ref >& A) { foo_impl(A); } + * \endcode + * + * + * \sa PlainObjectBase::Map(), \ref TopicStorageOrders + */ template class Ref : public RefBase > { diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h index bec598310..9960ef884 100644 --- a/Eigen/src/Core/Replicate.h +++ b/Eigen/src/Core/Replicate.h @@ -12,21 +12,6 @@ namespace Eigen { -/** - * \class Replicate - * \ingroup Core_Module - * - * \brief Expression of the multiple replication of a matrix or vector - * - * \param MatrixType the type of the object we are replicating - * - * This class represents an expression of the multiple replication of a matrix or vector. - * It is the return type of DenseBase::replicate() and most of the time - * this is the only way it is used. - * - * \sa DenseBase::replicate() - */ - namespace internal { template struct traits > @@ -57,6 +42,22 @@ struct traits > }; } +/** + * \class Replicate + * \ingroup Core_Module + * + * \brief Expression of the multiple replication of a matrix or vector + * + * \tparam MatrixType the type of the object we are replicating + * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic. + * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic. + * + * This class represents an expression of the multiple replication of a matrix or vector. + * It is the return type of DenseBase::replicate() and most of the time + * this is the only way it is used. + * + * \sa DenseBase::replicate() + */ template class Replicate : public internal::dense_xpr_base< Replicate >::type { diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index 7feb6e01c..c44b7673b 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h @@ -13,11 +13,6 @@ namespace Eigen { -/** \class ReturnByValue - * \ingroup Core_Module - * - */ - namespace internal { template @@ -48,6 +43,10 @@ struct nested_eval, n, PlainObject> } // end namespace internal +/** \class ReturnByValue + * \ingroup Core_Module + * + */ template class ReturnByValue : public internal::dense_xpr_base< ReturnByValue >::type, internal::no_assignment_operator { diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index d7c380c78..0640cda2a 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -14,20 +14,6 @@ namespace Eigen { -/** \class Reverse - * \ingroup Core_Module - * - * \brief Expression of the reverse of a vector or matrix - * - * \param MatrixType the type of the object of which we are taking the reverse - * - * This class represents an expression of the reverse of a vector. - * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse() - * and most of the time this is the only way it is used. 
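A quick sketch of the Replicate and Reverse expressions whose documentation is being reorganized here (the vector values are invented):

\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::RowVector3i v;
  v << 1, 2, 3;

  // replicate(rows, cols) tiles the expression: 2x2 copies of a 1x3
  // vector yield a 2x6 Replicate expression.
  std::cout << v.replicate(2, 2) << "\n\n";

  // reverse() returns a Reverse expression of the same size.
  std::cout << v.reverse() << "\n";
}
\endcode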
- * - * \sa MatrixBase::reverse(), VectorwiseOp::reverse() - */ - namespace internal { template @@ -60,6 +46,20 @@ template struct reverse_packet_cond } // end namespace internal +/** \class Reverse + * \ingroup Core_Module + * + * \brief Expression of the reverse of a vector or matrix + * + * \tparam MatrixType the type of the object of which we are taking the reverse + * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections + * + * This class represents an expression of the reverse of a vector. + * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse() + * and most of the time this is the only way it is used. + * + * \sa MatrixBase::reverse(), VectorwiseOp::reverse() + */ template class Reverse : public internal::dense_xpr_base< Reverse >::type { diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 9a2f4f1eb..513742f34 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -31,8 +31,8 @@ namespace Eigen { * arguments to the constructor. * * Indeed, this class takes two template parameters: - * \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. - * \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. + * \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. + * \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. * * Here is an example: * \include Map_general_stride.cpp diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 5b66eb5e1..f199d1086 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -13,20 +13,6 @@ namespace Eigen { -/** \class Transpose - * \ingroup Core_Module - * - * \brief Expression of the transpose of a matrix - * - * \param MatrixType the type of the object of which we are taking the transpose - * - * This class represents an expression of the transpose of a matrix. - * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint() - * and most of the time this is the only way it is used. - * - * \sa MatrixBase::transpose(), MatrixBase::adjoint() - */ - namespace internal { template struct traits > : public traits @@ -50,6 +36,19 @@ struct traits > : public traits template class TransposeImpl; +/** \class Transpose + * \ingroup Core_Module + * + * \brief Expression of the transpose of a matrix + * + * \tparam MatrixType the type of the object of which we are taking the transpose + * + * This class represents an expression of the transpose of a matrix. + * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint() + * and most of the time this is the only way it is used. + * + * \sa MatrixBase::transpose(), MatrixBase::adjoint() + */ template class Transpose : public TransposeImpl::StorageKind> { diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 3b1c1815d..678ba3288 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -12,35 +12,6 @@ namespace Eigen { -/** \class Transpositions - * \ingroup Core_Module - * - * \brief Represents a sequence of transpositions (row/column interchange) - * - * \param SizeAtCompileTime the number of transpositions, or Dynamic - * \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. 
Most of the time, you should not have to specify it. - * - * This class represents a permutation transformation as a sequence of \em n transpositions - * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices. - * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges - * the rows \c i and \c indices[i] of the matrix \c M. - * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange. - * - * Compared to the class PermutationMatrix, such a sequence of transpositions is what is - * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place. - * - * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example: - * \code - * Transpositions tr; - * MatrixXf mat; - * mat = tr * mat; - * \endcode - * In this example, we detect that the matrix appears on both side, and so the transpositions - * are applied in-place without any temporary or extra copy. - * - * \sa class PermutationMatrix - */ - template class TranspositionsBase { @@ -154,6 +125,35 @@ struct traits class Transpositions : public TranspositionsBase > { diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h index 216c568c4..d72fbf7e9 100644 --- a/Eigen/src/Core/VectorBlock.h +++ b/Eigen/src/Core/VectorBlock.h @@ -13,13 +13,23 @@ namespace Eigen { +namespace internal { +template +struct traits > + : public traits::Flags & RowMajorBit ? 1 : Size, + traits::Flags & RowMajorBit ? Size : 1> > +{ +}; +} + /** \class VectorBlock * \ingroup Core_Module * * \brief Expression of a fixed-size or dynamic-size sub-vector * - * \param VectorType the type of the object in which we are taking a sub-vector - * \param Size size of the sub-vector we are taking at compile time (optional) + * \tparam VectorType the type of the object in which we are taking a sub-vector + * \tparam Size size of the sub-vector we are taking at compile time (optional) * * This class represents an expression of either a fixed-size or dynamic-size sub-vector. * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and @@ -43,17 +53,6 @@ namespace Eigen { * * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index) */ - -namespace internal { -template -struct traits > - : public traits::Flags & RowMajorBit ? 1 : Size, - traits::Flags & RowMajorBit ? Size : 1> > -{ -}; -} - template class VectorBlock : public Block::Flags & RowMajorBit ? 1 : Size, diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 483f71909..95bcaa86f 100755 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -141,8 +141,8 @@ struct member_redux { * * \brief Pseudo expression providing partial reduction operations * - * \param ExpressionType the type of the object on which to do partial reductions - * \param Direction indicates the direction of the redux (#Vertical or #Horizontal) + * \tparam ExpressionType the type of the object on which to do partial reductions + * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal) * * This class represents a pseudo expression with partial reduction features. 
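 * For instance (an illustrative sketch, not part of this patch):
 * \code
 * MatrixXd m(2,3);
 * m << 1, 2, 3,
 *      4, 5, 6;
 * RowVectorXd colSums = m.colwise().sum();    // [5 7 9]
 * VectorXd rowMaxs = m.rowwise().maxCoeff();  // [3 6]^T
 * \endcode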
* It is the return type of DenseBase::colwise() and DenseBase::rowwise() @@ -187,11 +187,11 @@ template class VectorwiseOp protected: - /** \internal - * \returns the i-th subvector according to the \c Direction */ typedef typename internal::conditional::type SubVector; + /** \internal + * \returns the i-th subvector according to the \c Direction */ EIGEN_DEVICE_FUNC SubVector subVector(Index i) { diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 6b93d5221..8b71e2c62 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -390,9 +390,9 @@ struct transfer_constness * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes * many coefficient accesses in the nested expressions -- as is the case with matrix product for example. * - * \param T the type of the expression being nested. - * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. - * \param PlainObject the type of the temporary if needed. + * \tparam T the type of the expression being nested. + * \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. + * \tparam PlainObject the type of the temporary if needed. */ template::type> struct nested_eval { @@ -575,7 +575,7 @@ template struct product_promote_storage_type struct product_promote_storage_type { typedef Dense ret; }; /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type. - * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. + * \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. */ template struct plain_row_type diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index dc9cd0a1a..469ea5e4e 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -375,8 +375,12 @@ namespace internal { * Performs a QR step on a tridiagonal symmetric matrix represented as a * pair of two vectors \a diag and \a subdiag. * - * \param matA the input selfadjoint matrix - * \param hCoeffs returned Householder coefficients + * \param diag the diagonal part of the input selfadjoint tridiagonal matrix + * \param subdiag the sub-diagonal part of the input selfadjoint tridiagonal matrix + * \param start starting index of the submatrix to work on + * \param end last+1 index of the submatrix to work on + * \param matrixQ pointer to the column-major matrix holding the eigenvectors, can be 0 + * \param n size of the input matrix * * For compilation efficiency reasons, this procedure does not use eigen expression * for its arguments. @@ -467,9 +471,10 @@ namespace internal { * \brief Compute the eigendecomposition from a tridiagonal matrix * * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues - * \param[in] subdiag : The subdiagonal part of the matrix. - * \param[in,out] : On input, the maximum number of iterations, on output, the effective number of iterations. - * \param[out] eivec : The matrix to store the eigenvectors... if needed. 
allocated on input + * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition) + * \param[in] maxIterations : the maximum number of iterations + * \param[in] computeEigenvectors : whether the eigenvectors have to be computed or not + * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input. * \returns \c Success or \c NoConvergence */ template diff --git a/Eigen/src/Geometry/Hyperplane.h b/Eigen/src/Geometry/Hyperplane.h index 2d076d7f8..cc89639b6 100644 --- a/Eigen/src/Geometry/Hyperplane.h +++ b/Eigen/src/Geometry/Hyperplane.h @@ -22,8 +22,8 @@ namespace Eigen { * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n. * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane. * - * \param _Scalar the scalar type, i.e., the type of the coefficients - * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. * Notice that the dimension of the hyperplane is _AmbientDim-1. * * This class represents a hyperplane as the zero set of the implicit equation diff --git a/Eigen/src/Geometry/ParametrizedLine.h b/Eigen/src/Geometry/ParametrizedLine.h index 93edd9148..fdcd69760 100644 --- a/Eigen/src/Geometry/ParametrizedLine.h +++ b/Eigen/src/Geometry/ParametrizedLine.h @@ -23,8 +23,8 @@ namespace Eigen { * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$. * - * \param _Scalar the scalar type, i.e., the type of the coefficients - * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. */ template class ParametrizedLine diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h index 8b0ddcfb0..5ab0d5920 100644 --- a/Eigen/src/Geometry/Rotation2D.h +++ b/Eigen/src/Geometry/Rotation2D.h @@ -18,7 +18,7 @@ namespace Eigen { * * \brief Represents a rotation/orientation in a 2 dimensional space. * - * \param _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _Scalar the scalar type, i.e., the type of the coefficients * * This class is equivalent to a single scalar representing a counter-clockwise rotation * as a single angle in radians. It provides some additional features such as the automatic diff --git a/Eigen/src/Geometry/RotationBase.h b/Eigen/src/Geometry/RotationBase.h index b88661de6..fadfd9151 100644 --- a/Eigen/src/Geometry/RotationBase.h +++ b/Eigen/src/Geometry/RotationBase.h @@ -22,8 +22,8 @@ struct rotation_base_generic_product_selector; * * \brief Common base class for compact rotation representations * - * \param Derived is the derived type, i.e., a rotation type - * \param _Dim the dimension of the space + * \tparam Derived is the derived type, i.e., a rotation type + * \tparam _Dim the dimension of the space */ template class RotationBase @@ -164,8 +164,8 @@ namespace internal { * * Helper function to convert an arbitrary rotation object to a rotation matrix.
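 * For instance, at the user level (a sketch, not part of this patch):
 * \code
 * Rotation2Dd r(3.14159265358979 / 4.0);  // counter-clockwise rotation by pi/4
 * Vector2d v(1.0, 0.0);
 * Vector2d w = r * v;                     // rotate a vector
 * Matrix2d R = r.toRotationMatrix();      // convert to an explicit 2x2 matrix
 * \endcode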
* - * \param Scalar the numeric type of the matrix coefficients - * \param Dim the dimension of the current space + * \tparam Scalar the numeric type of the matrix coefficients + * \tparam Dim the dimension of the current space * * It returns a Dim x Dim fixed size matrix. * diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index e94aa189f..643138199 100644 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -18,7 +18,7 @@ namespace Eigen { * * \brief Represents a generic uniform scaling transformation * - * \param _Scalar the scalar type, i.e., the type of the coefficients. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients. * * This class represent a uniform scaling transformation. It is the return * type of Scaling(Scalar), and most of the time this is the only way it diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h index 7fda179cc..87ea445e9 100644 --- a/Eigen/src/Geometry/Translation.h +++ b/Eigen/src/Geometry/Translation.h @@ -18,8 +18,8 @@ namespace Eigen { * * \brief Represents a translation transformation * - * \param _Scalar the scalar type, i.e., the type of the coefficients. - * \param _Dim the dimension of the space, can be a compile time value or Dynamic + * \tparam _Scalar the scalar type, i.e., the type of the coefficients. + * \tparam _Dim the dimension of the space, can be a compile time value or Dynamic * * \note This class is not aimed to be used to store a translation transformation, * but rather to make easier the constructions and updates of Transform objects. diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 284e37f13..18e9d1466 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -21,7 +21,7 @@ namespace Eigen { * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with * Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999 * - * \tparam _MatrixType The type of the sparse matrix. It is advised to give a row-oriented sparse matrix + * \tparam Scalar the scalar type of the input matrices * \tparam _UpLo The triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. * \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. 
Default is AMDOrdering, diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 0c4d63923..1721213d6 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -29,7 +29,7 @@ template struct traits > * * \brief LU decomposition of a matrix with complete pivoting, and related features * - * \param MatrixType the type of the matrix of which we are computing the LU decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition * * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 50e920609..ab7797d2a 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -34,7 +34,7 @@ template struct traits > * * \brief LU decomposition of a matrix with partial pivoting, and related features * - * \param MatrixType the type of the matrix of which we are computing the LU decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition * * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h index 323255e0a..d1d08ca57 100644 --- a/Eigen/src/OrderingMethods/Amd.h +++ b/Eigen/src/OrderingMethods/Amd.h @@ -84,8 +84,11 @@ StorageIndex cs_tdfs(StorageIndex j, StorageIndex k, StorageIndex *head, const S /** \internal * \ingroup OrderingMethods_Module * Approximate minimum degree ordering algorithm. - * \returns the permutation P reducing the fill-in of the input matrix \a C - * The input matrix \a C must be a selfadjoint compressed column major SparseMatrix object. Both the upper and lower parts have to be stored, but the diagonal entries are optional. + * + * \param[in] C the input selfadjoint matrix stored in compressed column major format. + * \param[out] perm the permutation P reducing the fill-in of the input matrix \a C + * + * Note that the input matrix \a C must be complete, that is both the upper and lower parts have to be stored, as well as the diagonal entries. * On exit the values of C are destroyed */ template void minimum_degree_ordering(SparseMatrix& C, PermutationMatrix& perm) diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 25792a828..7ea9b14d7 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -19,20 +19,21 @@ namespace internal { /** \internal * \ingroup OrderingMethods_Module - * \returns the symmetric pattern A^T+A from the input matrix A. + * \param[in] A the input non-symmetric matrix + * \param[out] symmat the symmetric pattern A^T+A from the input matrix \a A. 
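 * At the user level, this machinery is reached through the ordering classes,
 * e.g. (an illustrative sketch, not part of this patch):
 * \code
 * SparseMatrix<double> A;                        // filled elsewhere
 * AMDOrdering<int> ordering;
 * PermutationMatrix<Dynamic, Dynamic, int> perm;
 * ordering(A.selfadjointView<Lower>(), perm);    // fill-in reducing permutation
 * \endcode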
* FIXME: The values should not be considered here */ template -void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat) +void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) { MatrixType C; - C = mat.transpose(); // NOTE: Could be costly + C = A.transpose(); // NOTE: Could be costly for (int i = 0; i < C.rows(); i++) { for (typename MatrixType::InnerIterator it(C, i); it; ++it) it.valueRef() = 0.0; } - symmat = C + mat; + symmat = C + A; } } diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 172e4a89f..d8bd4b950 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -28,7 +28,7 @@ template struct traits > * * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R * such that diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index 64fe6b7b8..32a10f3fe 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -37,7 +37,7 @@ struct traits > * * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R * such that diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 1eb861025..03bc8e6cd 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -21,7 +21,7 @@ namespace Eigen { * * \brief Householder QR decomposition of a matrix * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R * such that diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 896246e46..3552c87bf 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -47,9 +47,8 @@ struct traits > * * \brief class Bidiagonal Divide and Conquer SVD * - * \param MatrixType the type of the matrix of which we are computing the SVD decomposition - * We plan to have a very similar interface to JacobiSVD on this class. - * It should be used to speed up the calcul of SVD for big matrices. 
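 * A usage sketch of this class (illustrative, not part of this patch):
 * \code
 * MatrixXf A = MatrixXf::Random(1000, 300);
 * BDCSVD<MatrixXf> svd(A, ComputeThinU | ComputeThinV);
 * VectorXf b = VectorXf::Random(1000);
 * VectorXf x = svd.solve(b);               // least-squares solution of A x = b
 * \endcode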
+ * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition + * */ template class BDCSVD : public SVDBase > diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 59c965e15..bf5ff48c3 100755 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -449,8 +449,8 @@ struct traits > * * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix * - * \param MatrixType the type of the matrix of which we are computing the SVD decomposition - * \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally + * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition + * \tparam QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally * for the R-SVD step for non-square matrices. See discussion of possible values below. * * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 1343eb15c..2907f6529 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -39,18 +39,16 @@ namespace internal { } // end namespace internal /** \ingroup SparseCholesky_Module - * \brief A direct sparse Cholesky factorizations + * \brief A base class for direct sparse Cholesky factorizations * - * These classes provide LL^T and LDL^T Cholesky factorizations of sparse matrices that are - * selfadjoint and positive definite. The factorization allows for solving A.X = B where + * This is a base class for LL^T and LDL^T Cholesky factorizations of sparse matrices that are + * selfadjoint and positive definite. These factorizations allow for solving A.X = B where * X and B can be either dense or sparse. * * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization * such that the factorized matrix is P A P^-1. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam Derived the type of the derived class, that is the actual factorization type. 
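 * The derived factorizations are typically used as in this sketch (illustrative,
 * not part of this patch):
 * \code
 * SparseMatrix<double> A;                        // selfadjoint positive definite, filled elsewhere
 * VectorXd b(A.rows());
 * SimplicialLLT<SparseMatrix<double> > llt(A);   // the LL^T variant
 * VectorXd x = llt.solve(b);
 * \endcode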
 * */ template diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index e71a13b89..8c1b3e8bc 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -14,22 +14,21 @@ namespace Eigen { namespace internal { -/** - * \brief Performs numeric block updates from a given supernode to a single column - * - * \param segsize Size of the segment (and blocks ) to use for updates - * \param[in,out] dense Packed values of the original matrix - * \param tempv temporary vector to use for updates - * \param lusup array containing the supernodes - * \param lda Leading dimension in the supernode - * \param nrow Number of rows in the rectangular part of the supernode - * \param lsub compressed row subscripts of supernodes - * \param lptr pointer to the first column of the current supernode in lsub - * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode - * \return 0 on success - */ template struct LU_kernel_bmod { + /** \internal + * \brief Performs numeric block updates from a given supernode to a single column + * + * \param segsize Size of the segment (and blocks) to use for updates + * \param[in,out] dense Packed values of the original matrix + * \param tempv temporary vector to use for updates + * \param lusup array containing the supernodes + * \param lda Leading dimension in the supernode + * \param nrow Number of rows in the rectangular part of the supernode + * \param lsub compressed row subscripts of supernodes + * \param lptr pointer to the first column of the current supernode in lsub + * \param no_zeros Number of nonzero elements before the diagonal part of the supernode + */ template static EIGEN_DONT_INLINE void run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros); diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h index eb3e17330..ee97299af 100644 --- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h +++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h @@ -95,10 +95,10 @@ template struct GoogleSparseHashMapTraits * * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access * - * \param SparseMatrixType the type of the sparse matrix we are updating - * \param MapTraits a traits class representing the map implementation used for the temporary sparse storage. + * \tparam SparseMatrixType the type of the sparse matrix we are updating + * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage. * Its default value depends on the system. - * \param OuterPacketBits defines the number of rows (or columns) manage by a single map object + * \tparam OuterPacketBits defines the number of rows (or columns) managed by a single map object + * as a power of two exponent.
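 * Usage follows a scoped fill-then-flush pattern (a sketch, not part of this patch):
 * \code
 * SparseMatrix<double> m(1000,1000);
 * {
 *   RandomSetter<SparseMatrix<double> > setter(m);
 *   setter(13,42)  = 7.0;    // random-access write
 *   setter(42,13) += 1.0;
 * } // the setter's destructor writes the accumulated values back into m
 * \endcode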
* * This class temporarily represents a sparse matrix object using a generic map implementation allowing for -- cgit v1.2.3 From 715f6f049fb449df57484a36711d9fde8a5d1be0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 3 Jan 2016 21:56:30 +0100 Subject: Improve inline documentation of SparseCompressedBase and its derived classes --- Eigen/src/Core/Ref.h | 1 + Eigen/src/SparseCore/SparseCompressedBase.h | 10 +++++++ Eigen/src/SparseCore/SparseMap.h | 41 ++++++++++++++++++++++++++++- Eigen/src/SparseCore/SparseRef.h | 25 +++++++++++++----- 4 files changed, 69 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index ae414204e..6e94181f3 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -207,6 +207,7 @@ template class Ref EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) #else + /** Implicit constructor from any dense expression */ template inline Ref(DenseBase& expr) #endif diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index c223e4f42..f78d7c24d 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -22,6 +22,16 @@ struct traits > : traits } // end namespace internal +/** \ingroup SparseCore_Module + * \class SparseCompressedBase + * \brief Common base class for sparse [compressed]-{row|column}-storage format. + * + * This class defines the common interface for all derived classes implementing the compressed sparse storage format, such as: + * - SparseMatrix + * - Ref + * - Map + * + */ template class SparseCompressedBase : public SparseMatrixBase diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index 36c09ab0c..eb241c3e2 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -37,11 +37,15 @@ struct traits, Options, St }; } // end namespace internal - + template::has_write_access ? WriteAccessors : ReadOnlyAccessors > class SparseMapBase; +/** \ingroup SparseCore_Module + * class SparseMapBase + * \brief Common base class for Map and Ref instance of sparse matrix and vector. + */ template class SparseMapBase : public SparseCompressedBase @@ -71,22 +75,33 @@ class SparseMapBase public: + /** \copydoc SparseMatrixBase::rows() */ inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + /** \copydoc SparseMatrixBase::cols() */ inline Index cols() const { return IsRowMajor ? 
m_innerSize : m_outerSize; } + /** \copydoc SparseMatrixBase::innerSize() */ inline Index innerSize() const { return m_innerSize; } + /** \copydoc SparseMatrixBase::outerSize() */ inline Index outerSize() const { return m_outerSize; } + /** \copydoc SparseCompressedBase::nonZeros */ inline Index nonZeros() const { return m_zero_nnz[1]; } + /** \copydoc SparseCompressedBase::isCompressed */ bool isCompressed() const { return m_innerNonZeros==0; } //---------------------------------------- // direct access interface + /** \copydoc SparseMatrix::valuePtr */ inline const Scalar* valuePtr() const { return m_values; } + /** \copydoc SparseMatrix::innerIndexPtr */ inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } + /** \copydoc SparseMatrix::outerIndexPtr */ inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } + /** \copydoc SparseMatrix::innerNonZeroPtr */ inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; } //---------------------------------------- + /** \copydoc SparseMatrix::coeff */ inline Scalar coeff(Index row, Index col) const { const Index outer = IsRowMajor ? row : col; @@ -125,6 +140,10 @@ class SparseMapBase inline SparseMapBase() {} }; +/** \ingroup SparseCore_Module + * class SparseMapBase + * \brief Common base class for writable Map and Ref instance of sparse matrix and vector. + */ template class SparseMapBase : public SparseMapBase @@ -185,9 +204,23 @@ class SparseMapBase inline SparseMapBase() {} }; +/** \ingroup SparseCore_Module + * + * \brief Specialization of class Map for SparseMatrix-like storage. + * + * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix. + * + * \sa class Map, class SparseMatrix, class Ref + */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Map, Options, StrideType> : public SparseMapBase, Options, StrideType> > +#else +template +class Map + : public SparseMapBase +#endif { public: typedef SparseMapBase Base; @@ -196,6 +229,12 @@ class Map, Options, StrideType> public: + /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients, + * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr. + * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed. + * + * More details on the expected storage schemes are given in the \ref TutorialSparse "manual pages". + */ inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h index 605ca42ba..a558230e7 100644 --- a/Eigen/src/SparseCore/SparseRef.h +++ b/Eigen/src/SparseCore/SparseRef.h @@ -108,20 +108,25 @@ protected: /** - * \ingroup Sparse_Module + * \ingroup SparseCore_Module * * \brief A sparse matrix expression referencing an existing sparse expression * - * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data + * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix. * \tparam Options specifies whether the a standard compressed format is required \c Options is \c #StandardCompressedFormat, or \c 0. * The default is \c 0. 
- * \tparam StrideType Only used for dense Ref * * \sa class Ref */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Ref, Options, StrideType > : public internal::SparseRefBase, Options, StrideType > > +#else +template +class Ref + : public SparseMapBase // yes, that's weird to use Derived here, but that works! +#endif { typedef SparseMatrix PlainObjectType; typedef internal::traits Traits; @@ -155,6 +160,7 @@ class Ref, Options, StrideType > template inline Ref(const SparseCompressedBase& expr) #else + /** Implicit constructor from any sparse expression (2D matrix or 1D vector) */ template inline Ref(SparseCompressedBase& expr) #endif @@ -225,19 +231,23 @@ class Ref, Options, StrideType /** - * \ingroup Sparse_Module + * \ingroup SparseCore_Module * * \brief A sparse vector expression referencing an existing sparse vector expression * - * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data - * \tparam Options Not used for SparseVector. - * \tparam StrideType Only used for dense Ref + * \tparam SparseVectorType the equivalent sparse vector type of the referenced data, it must be a template instance of class SparseVector. * * \sa class Ref */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Ref, Options, StrideType > : public internal::SparseRefBase, Options, StrideType > > +#else +template +class Ref + : public SparseMapBase +#endif { typedef SparseVector PlainObjectType; typedef internal::traits Traits; @@ -259,6 +269,7 @@ class Ref, Options, StrideType > template inline Ref(const SparseCompressedBase& expr) #else + /** Implicit constructor from any 1D sparse vector expression */ template inline Ref(SparseCompressedBase& expr) #endif -- cgit v1.2.3 From 515dee0bafc69dd79cfbf9c2831e707d2214390f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 Jan 2016 16:29:26 -0800 Subject: Added a 'divup' util to compute the ceiling of the quotient of two integers --- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index f28a9699d..d6ad65070 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -24,6 +24,11 @@ const T2& choose(Cond, const T1&, const T2& second) { return second; } +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const T x, const T y) { + return (x + y - 1) / y; +} + template struct max_n_1 { static const size_t size = n; }; -- cgit v1.2.3 From cfff40b1d48999d354be34c984c83f7d0f1ca5cb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 Jan 2016 17:25:00 -0800 Subject: Improved the performance of reductions on CUDA devices --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 31 +++++++ .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 101 ++++++++++++++++++++- 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c30980a49..2ecdccdfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -337,6 +337,16 @@ struct FullReducer { #endif +// Default inner reducer +template +struct InnerReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + assert(false && "Not implemented"); + }
+}; + // Default outer reducer template struct OuterReducer { @@ -352,6 +362,9 @@ struct OuterReducer { template __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +template +__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif @@ -516,6 +529,23 @@ struct TensorEvaluator, Device> // Attempt to use an optimized reduction. #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reducedDims[i]; + } else { + reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + Op reducer(m_reducer); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return false; + } + bool preserving_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { @@ -615,6 +645,7 @@ struct TensorEvaluator, Device> #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 8e250867c..b046607b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -131,7 +131,102 @@ struct FullReducer { } }; -#define DIVUP(x, y) (((x) + (y)-1) / (y)) + +extern __shared__ float temp[]; + +template +__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + + const Index input_col_blocks = divup(num_coeffs_to_reduce, blockDim.x * NumPerThread); + const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = i / input_col_blocks; + + if (row < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; + + float reduced_val = reducer.initialize(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); + if (last_col >= num_coeffs_to_reduce) { + 
for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) { + const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + reducer.reduce(val, &reduced_val); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k); + reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); + } + } + } + + temp[threadIdx.x] = reduced_val; + + __syncthreads(); + const int warp_id = threadIdx.x & 31; + if (warp_id < 16) reducer.reduce(temp[threadIdx.x + 16], &temp[threadIdx.x]); + if (warp_id < 8) reducer.reduce(temp[threadIdx.x + 8], &temp[threadIdx.x]); + if (warp_id < 4) reducer.reduce(temp[threadIdx.x + 4], &temp[threadIdx.x]); + if (warp_id < 2) reducer.reduce(temp[threadIdx.x + 2], &temp[threadIdx.x]); + if (warp_id < 1) { + reducer.reduce(temp[threadIdx.x + 1], &temp[threadIdx.x]); + atomicReduce(&(output[row]), temp[threadIdx.x], reducer); + } + } + + __syncthreads(); + } +} + +template +struct InnerReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same::value; + + template + static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce floats on a gpu device"); + } + + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = 32; + + LAUNCH_CUDA_KERNEL((InnerReductionKernel), + num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + } +}; + template @@ -145,7 +240,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu } // Do the reduction. 
- const Index max_iter = DIVUP(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; + const Index max_iter = divup(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; @@ -189,8 +284,6 @@ struct OuterReducer { } }; -#undef DIVUP - #endif -- cgit v1.2.3 From 54bf582303407387f9a34d99c8993aa3255274ec Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 6 Jan 2016 11:59:24 +0100 Subject: bug #1143: Work-around gcc bug --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 6238676e5..70c987afa 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -516,7 +516,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ Col [col].start = p [col] ; Col [col].length = p [col+1] - p [col] ; - if (Col [col].length < 0) + if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; -- cgit v1.2.3 From ee738321aa6c13f327821f4a4b1aaa4ead635687 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 6 Jan 2016 14:49:40 +0100 Subject: rm remaining debug code --- Eigen/src/Core/util/Constants.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 9e6816021..5f71ba3df 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -224,7 +224,7 @@ enum { /** \ingroup enums * Enum for indicating whether a buffer is aligned or not. */ -enum Foo { +enum { Unaligned=0, /**< Data pointer has no specific alignment. */ Aligned8=8, /**< Data pointer is aligned on a 8 bytes boundary. */ Aligned16=16, /**< Data pointer is aligned on a 16 bytes boundary. */ -- cgit v1.2.3 From 213459d81850f98f3822624ae84c1f420f12092c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Jan 2016 18:47:45 -0800 Subject: Optimized the performance of broadcasting of scalars. --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 25 +++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index dc64959e1..0c95e5c0b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -46,6 +46,21 @@ struct nested, 1, typename eval type; }; +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar > { + static const bool value = true; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct is_input_scalar > { + static const bool value = (Sizes::total_size == 1); +}; +#endif + } // end namespace internal @@ -103,7 +118,7 @@ struct TensorEvaluator, Device> // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. 
EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const InputDimensions& input_dims = m_impl.dimensions(); const Broadcast& broadcast = op.broadcast(); for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); @@ -143,6 +158,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + if (static_cast(Layout) == static_cast(ColMajor)) { return coeffColMajor(index); } else { @@ -214,6 +233,10 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + if (static_cast(Layout) == static_cast(ColMajor)) { return packetColMajor(index); } else { -- cgit v1.2.3 From 0cb2ca5de216f258cd3106cf18d4aada9064ce13 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Jan 2016 18:50:28 -0800 Subject: Fixed a typo. --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 0c95e5c0b..9ec20a99e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -234,7 +234,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { if (internal::is_input_scalar::type>::value) { - return m_impl.coeff(0); + return internal::pset1(m_impl.coeff(0)); } if (static_cast(Layout) == static_cast(ColMajor)) { -- cgit v1.2.3 From 6639b7d6e86ef36f6f78cf51e36efa5a004154eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Jan 2016 18:45:19 -0800 Subject: Removed a couple of partial specializations that confuse nvcc and result in errors such as this: error: more than one partial specialization matches the template argument list of class "Eigen::internal::get<3, Eigen::internal::numeric_list>" "Eigen::internal::get>" "Eigen::internal::get>" --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 3f149c6a3..4d99f786c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -109,11 +109,9 @@ template struct get; template struct get> : get> {}; template struct get<0, type_list> { typedef a type; }; -template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; template struct get> : get> {}; template struct get<0, numeric_list> { constexpr static T value = a; }; -template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; /* always get type, regardless of dummy; good for parameter pack expansion */ -- cgit v1.2.3 From f9d71a172992cfda5e2733f9f4a6e12a14b9ed73 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 8 Jan 2016 22:24:45 +0100 Subject: extend matlab conversion table --- doc/AsciiQuickReference.txt | 110 +++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff
--git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index b5bdfa1f4..c604e575c 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -32,17 +32,19 @@ A << 1, 2, 3, // Initialize A. The elements can also be B << A, A, A; // B is three horizontally stacked A's. A.fill(10); // Fill A with all 10's. -// Eigen // Matlab -MatrixXd::Identity(rows,cols) // eye(rows,cols) -C.setIdentity(rows,cols) // C = eye(rows,cols) -MatrixXd::Zero(rows,cols) // zeros(rows,cols) -C.setZero(rows,cols) // C = ones(rows,cols) -MatrixXd::Ones(rows,cols) // ones(rows,cols) -C.setOnes(rows,cols) // C = ones(rows,cols) -MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1). -C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 -VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' -v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' +// Eigen // Matlab +MatrixXd::Identity(rows,cols) // eye(rows,cols) +C.setIdentity(rows,cols) // C = eye(rows,cols) +MatrixXd::Zero(rows,cols) // zeros(rows,cols) +C.setZero(rows,cols) // C = ones(rows,cols) +MatrixXd::Ones(rows,cols) // ones(rows,cols) +C.setOnes(rows,cols) // C = ones(rows,cols) +MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1). +C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 +VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' +v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' +VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi + low,low+step*(size-1)) // Matrix slicing and blocks. All expressions listed here are read/write. @@ -85,13 +87,15 @@ P.bottomRightCorner() // P(end-rows+1:end, end-cols+1:end) R.row(i) = P.col(j); // R(i, :) = P(:, i) R.col(j1).swap(mat1.col(j2)); // R(:, [j1 j2]) = R(:, [j2, j1]) -// Views, transpose, etc; all read-write except for .adjoint(). +// Views, transpose, etc; // Eigen // Matlab R.adjoint() // R' -R.transpose() // R.' or conj(R') -R.diagonal() // diag(R) +R.transpose() // R.' or conj(R') // Read-write +R.diagonal() // diag(R) // Read-write x.asDiagonal() // diag(x) -R.transpose().colwise().reverse(); // rot90(R) +R.transpose().colwise().reverse() // rot90(R) // Read-write +R.replicate(i,j) // repmat(P,i,j) + // All the same as Matlab, but matlab doesn't have *= style operators. // Matrix-vector. Matrix-matrix. Matrix-scalar. @@ -103,37 +107,39 @@ a *= M; R = P + Q; R = P/s; R -= Q; R /= s; // Vectorized operations on each element independently -// Eigen // Matlab -R = P.cwiseProduct(Q); // R = P .* Q -R = P.array() * s.array();// R = P .* s -R = P.cwiseQuotient(Q); // R = P ./ Q -R = P.array() / Q.array();// R = P ./ Q -R = P.array() + s.array();// R = P + s -R = P.array() - s.array();// R = P - s -R.array() += s; // R = R + s -R.array() -= s; // R = R - s -R.array() < Q.array(); // R < Q -R.array() <= Q.array(); // R <= Q -R.cwiseInverse(); // 1 ./ P -R.array().inverse(); // 1 ./ P -R.array().sin() // sin(P) -R.array().cos() // cos(P) -R.array().pow(s) // P .^ s -R.array().square() // P .^ 2 -R.array().cube() // P .^ 3 -R.cwiseSqrt() // sqrt(P) -R.array().sqrt() // sqrt(P) -R.array().exp() // exp(P) -R.array().log() // log(P) -R.cwiseMax(P) // max(R, P) -R.array().max(P.array()) // max(R, P) -R.cwiseMin(P) // min(R, P) -R.array().min(P.array()) // min(R, P) -R.cwiseAbs() // abs(P) -R.array().abs() // abs(P) -R.cwiseAbs2() // abs(P.^2) -R.array().abs2() // abs(P.^2) -(R.array() < s).select(P,Q); // (R < s ? 
P : Q) +// Eigen // Matlab +R = P.cwiseProduct(Q); // R = P .* Q +R = P.array() * s.array(); // R = P .* s +R = P.cwiseQuotient(Q); // R = P ./ Q +R = P.array() / Q.array(); // R = P ./ Q +R = P.array() + s.array(); // R = P + s +R = P.array() - s.array(); // R = P - s +R.array() += s; // R = R + s +R.array() -= s; // R = R - s +R.array() < Q.array(); // R < Q +R.array() <= Q.array(); // R <= Q +R.cwiseInverse(); // 1 ./ P +R.array().inverse(); // 1 ./ P +R.array().sin() // sin(P) +R.array().cos() // cos(P) +R.array().pow(s) // P .^ s +R.array().square() // P .^ 2 +R.array().cube() // P .^ 3 +R.cwiseSqrt() // sqrt(P) +R.array().sqrt() // sqrt(P) +R.array().exp() // exp(P) +R.array().log() // log(P) +R.cwiseMax(P) // max(R, P) +R.array().max(P.array()) // max(R, P) +R.cwiseMin(P) // min(R, P) +R.array().min(P.array()) // min(R, P) +R.cwiseAbs() // abs(P) +R.array().abs() // abs(P) +R.cwiseAbs2() // abs(P.^2) +R.array().abs2() // abs(P.^2) +(R.array() < s).select(P,Q ); // (R < s ? P : Q) +R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0) + // Reductions. int r, c; @@ -164,12 +170,12 @@ x.dot(y) // dot(x, y) x.cross(y) // cross(x, y) Requires #include //// Type conversion -// Eigen // Matlab -A.cast(); // double(A) -A.cast(); // single(A) -A.cast(); // int32(A) -A.real(); // real(A) -A.imag(); // imag(A) +// Eigen // Matlab +A.cast(); // double(A) +A.cast(); // single(A) +A.cast(); // int32(A) +A.real(); // real(A) +A.imag(); // imag(A) // if the original type equals destination type, no work is done // Note that for most operations Eigen requires all operands to have the same type: -- cgit v1.2.3 From 53749ff4152191d2f1bd56090a14f6474fe059c2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 13:53:40 -0800 Subject: Prevent nvcc from miscompiling the cuda metakernel. Unfortunately this reintroduces some compulation warnings but it's much better than having to deal with random assertion failures. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 6 +----- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 ++++------------ unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 4 ++-- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 4d7570077..af140a68b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -238,14 +238,10 @@ struct GpuDevice { }; -#ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#else -#define LAUNCH_CUDA_KERNEL(...) \ - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif + // FIXME: Should be device and kernel specific. 
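// Illustrative host-side call site for the macro above, modeled on the uses in
// TensorExecutor below (an assumption for exposition, not part of this patch):
//   LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>),
//                      num_blocks, block_size, 0, device, evaluator, size);
// This expands to kernel<<<gridsize, blocksize, sharedmem, device.stream()>>>(args)
// followed by a cudaGetLastError() assertion.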
#ifdef __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index c28078882..d93e1de1b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template class TensorExecutor { public: typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; template class TensorExecutor { public: typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -213,9 +213,8 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { /*static*/ template -EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { -#ifndef __CUDA_ARCH__ TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -228,17 +227,13 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run( LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); -#else - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif } /*static*/ template -EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { -#ifndef __CUDA_ARCH__ TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -251,9 +246,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(c LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); -#else - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif } #endif // __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index b046607b9..558d0c83d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -115,11 +115,11 @@ struct FullReducer { internal::is_same::value; template - EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { assert(false && "Should only be called on floats"); } - EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); -- cgit v1.2.3 From 3358dfd5dd6c98fda9be133d1c4958ac00221006 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:28:53 -0800 Subject: Reworked the dispatch of optimized cuda reduction kernels to workaround a nvcc bug that prevented the code from compiling in optimized mode in some cases --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 12 +++++------- 1 file changed, 5 
insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 2ecdccdfd..1c721de6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -506,7 +506,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type CoeffReturnType; typedef typename internal::remove_const::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. @@ -527,7 +527,6 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -537,12 +536,12 @@ struct TensorEvaluator, Device> reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; } } - if (internal::InnerReducer::HasOptimizedImplementation && + if (internal::InnerReducer::HasOptimizedImplementation && (reducing_inner_dims || ReducingInnerMostDims)) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); return false; } @@ -554,16 +553,15 @@ struct TensorEvaluator, Device> preserving_inner_dims &= m_reducedDims[i]; } } - if (internal::OuterReducer::HasOptimizedImplementation && + if (internal::OuterReducer::HasOptimizedImplementation && preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); return false; } } -#endif return true; } -- cgit v1.2.3 From d726e864ac91a1b5d5128972ee9e169966f2fa51 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:38:14 -0800 Subject: Made it possible to use array of size 0 on CUDA devices --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index ab9c2ec3e..456b34d0b 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -132,13 +132,13 @@ template class array { return *static_cast(NULL); } - static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() { } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - array(std::initializer_list l) { + EIGEN_DEVICE_FUNC array(std::initializer_list l) { eigen_assert(l.size() == 0); } #endif -- cgit v1.2.3 From e76904af1b0f6aef80b68492534404f534ce0240 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:50:57 -0800 Subject: 
Simplified the dispatch code. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 1c721de6d..fd7064459 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -342,7 +342,7 @@ template struct InnerReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { assert(false && "Not implemented"); } }; @@ -352,7 +352,7 @@ template struct OuterReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { assert(false && "Not implemented"); } }; @@ -527,6 +527,7 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. +#if 0 else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -562,6 +563,7 @@ struct TensorEvaluator, Device> return false; } } +#endif return true; } -- cgit v1.2.3 From 8b9dc9f0dfb44c2c4ad6d02fb88ecce0986cd154 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 9 Jan 2016 08:30:38 +0100 Subject: bug #1144: fix regression in x=y+A*x (aliasing), and move evaluator_traits::AssumeAliasing to evaluator_assume_aliasing. --- Eigen/src/Core/AssignEvaluator.h | 14 +++++------ Eigen/src/Core/CoreEvaluators.h | 8 +++---- Eigen/src/Core/ProductEvaluators.h | 25 +++++++++----------- Eigen/src/Core/SelfAdjointView.h | 2 -- Eigen/src/Core/TriangularMatrix.h | 4 ---- Eigen/src/Geometry/Homogeneous.h | 1 - Eigen/src/SparseCore/SparseSelfAdjointView.h | 2 -- Eigen/src/SparseQR/SparseQR.h | 1 - test/product.h | 35 +++++++++++++++++++++------- test/product_large.cpp | 23 ++++++++++++++++++ test/product_notemporary.cpp | 6 +++++ 11 files changed, 77 insertions(+), 44 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 9dfffbcc4..f6632de69 100755 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -682,9 +682,9 @@ template< typename DstXprType, typename SrcXprType, typename Functor, struct Assignment; -// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition. -// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes thing quite complicated. -// So this intermediate function removes everything related to AssumeAliasing such that Assignment +// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition. +// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite complicated. +// So this intermediate function removes everything related to "assume-aliasing" such that Assignment // does not has to bother about these annoying details. 
template @@ -698,21 +698,21 @@ EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src) call_assignment(dst, src, internal::assign_op()); } -// Deal with AssumeAliasing +// Deal with "assume-aliasing" template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::AssumeAliasing==1, void*>::type = 0) +EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) { typename plain_matrix_type::type tmp(src); call_assignment_no_alias(dst, tmp, func); } template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::AssumeAliasing==0, void*>::type = 0) +EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) { call_assignment_no_alias(dst, src, func); } -// by-pass AssumeAliasing +// by-pass "assume-aliasing" // When there is no aliasing, we require that 'dst' has been properly resized template class StorageBase, typename Src, typename Func> EIGEN_DEVICE_FUNC void call_assignment(NoAlias& dst, const Src& src, const Func& func) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index f97dc33de..8bd73b814 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -63,10 +63,6 @@ struct evaluator_traits_base // by default, get evaluator kind and shape from storage typedef typename storage_kind_to_evaluator_kind::StorageKind>::Kind Kind; typedef typename storage_kind_to_shape::StorageKind>::Shape Shape; - - // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a - // temporary; 0 if not. - static const int AssumeAliasing = 0; }; // Default evaluator traits @@ -75,6 +71,10 @@ struct evaluator_traits : public evaluator_traits_base { }; +template::Shape > +struct evaluator_assume_aliasing { + static const bool value = false; +}; // By default, we assume a unary expression: template diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 794038a2a..b2a0a4b4f 100755 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -38,10 +38,9 @@ struct evaluator > // Catch scalar * ( A * B ) and transform it to (A*scalar) * B // TODO we should apply that rule only if that's really helpful template -struct evaluator_traits, const Product > > - : evaluator_traits_base, const Product > > +struct evaluator_assume_aliasing, const Product > > { - enum { AssumeAliasing = 1 }; + static const bool value = true; }; template struct evaluator, const Product > > @@ -81,17 +80,8 @@ template< typename Lhs, typename Rhs, struct generic_product_impl; template -struct evaluator_traits > - : evaluator_traits_base > -{ - enum { AssumeAliasing = 1 }; -}; - -template -struct evaluator_traits > - : evaluator_traits_base > -{ - enum { AssumeAliasing = 0 }; +struct evaluator_assume_aliasing > { + static const bool value = true; }; // This is the default evaluator implementation for products: @@ -189,6 +179,13 @@ struct Assignment" expression to save one temporary // FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct +// TODO enable it for "Dense ?= xpr - Product<>" as well. 
+ +template +struct evaluator_assume_aliasing, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; template struct assignment_from_xpr_plus_product diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 87e87ab3a..e709eb213 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -203,8 +203,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SelfAdjointShape Shape; - - static const int AssumeAliasing = 0; }; template diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 7d6a97848..27845e89c 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -704,10 +704,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef typename glue_shapes::Shape, TriangularShape>::type Shape; - - // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a - // temporary; 0 if not. - static const int AssumeAliasing = 0; }; template diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h index 367fd3930..cd52b5470 100644 --- a/Eigen/src/Geometry/Homogeneous.h +++ b/Eigen/src/Geometry/Homogeneous.h @@ -304,7 +304,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef HomogeneousShape Shape; - static const int AssumeAliasing = 0; }; template<> struct AssignmentKind { typedef Dense2Dense Kind; }; diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 46c6ce1d3..975cefd28 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -211,8 +211,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SparseSelfAdjointShape Shape; - - static const int AssumeAliasing = 0; }; struct SparseSelfAdjoint2Sparse {}; diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 4f26c19ca..0d448d02e 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -691,7 +691,6 @@ struct evaluator_traits > typedef typename SparseQRType::MatrixType MatrixType; typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SparseShape Shape; - static const int AssumeAliasing = 0; }; template< typename DstXprType, typename SparseQRType> diff --git a/test/product.h b/test/product.h index 9dfff9303..bd92309d2 100644 --- a/test/product.h +++ b/test/product.h @@ -145,14 +145,31 @@ template void product(const MatrixType& m) VERIFY_IS_APPROX(res.col(r).noalias() = square * square.col(r), (square * square.col(r)).eval()); // inner product - Scalar x = square2.row(c) * square2.col(c2); - VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum()); - + { + Scalar x = square2.row(c) * square2.col(c2); + VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum()); + } + // outer product - VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose()); - VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.leftCols(1) * 
m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols)); + { + VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose()); + VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.leftCols(1) * m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols)); + } + + // Aliasing + { + ColVectorType x(cols); x.setRandom(); + ColVectorType z(x); + ColVectorType y(cols); y.setZero(); + ColSquareMatrixType A(cols,cols); A.setRandom(); + // CwiseBinaryOp + VERIFY_IS_APPROX(x = y + A*x, A*z); + x = z; + // CwiseUnaryOp + VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z); + } } diff --git a/test/product_large.cpp b/test/product_large.cpp index 7207973c2..98f84c53b 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -9,6 +9,27 @@ #include "product.h" +template +void test_aliasing() +{ + int rows = internal::random(1,12); + int cols = internal::random(1,12); + typedef Matrix MatrixType; + typedef Matrix VectorType; + VectorType x(cols); x.setRandom(); + VectorType z(x); + VectorType y(rows); y.setZero(); + MatrixType A(rows,cols); A.setRandom(); + // CwiseBinaryOp + VERIFY_IS_APPROX(x = y + A*x, A*z); // OK because "y + A*x" is marked as "assume-aliasing" + x = z; + // CwiseUnaryOp + VERIFY_IS_APPROX(x = T(1.)*(A*x), A*z); // OK because 1*(A*x) is replaced by (1*A*x) which is a Product<> expression + x = z; + // VERIFY_IS_APPROX(x = y-A*x, -A*z); // Not OK in 3.3 because x is resized before A*x gets evaluated + x = z; +} + void test_product_large() { for(int i = 0; i < g_repeat; i++) { @@ -17,6 +38,8 @@ void test_product_large() CALL_SUBTEST_3( product(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_4( product(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( product(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + + CALL_SUBTEST_1( test_aliasing() ); } #if defined EIGEN_TEST_PART_6 diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index ff93cb881..5a3f3a01a 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -43,10 +43,16 @@ template void product_notemporary(const MatrixType& m) r1 = internal::random(8,rows-r0); VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()), 1); + VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()).transpose(), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = m1 * m2.adjoint(), 0); + VERIFY_EVALUATION_COUNT( m3 = s1 * (m1 * m2.transpose()), 1); +// VERIFY_EVALUATION_COUNT( m3 = m3 + s1 * (m1 * m2.transpose()), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * (m1 * m2.transpose()), 0); + VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()), 1); + + VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()).transpose(), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0); VERIFY_EVALUATION_COUNT( m3.noalias() += m3 + m1 * m2.transpose(), 0); VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 + m1 * m2.transpose(), 0); -- cgit v1.2.3 From 
403a7cb6c34d163e4f120387b5dc5487d30bb1d5 Mon Sep 17 00:00:00 2001 From: Jeremy Barnes Date: Sun, 10 Jan 2016 22:39:13 -0500 Subject: Alternative way of forcing instantiation of device kernels without causing warnings or requiring device to device kernel invocations. This allows Tensorflow to work on SM 3.0 (ie, Amazon EC2) machines. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 10 ++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 4 ++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index af140a68b..359a01b8f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -242,6 +242,16 @@ struct GpuDevice { (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); +#ifndef __CUDA_ARCH__ +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); +#else +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); +#endif + // FIXME: Should be device and kernel specific. #ifdef __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index fd7064459..9a66e81f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -506,7 +506,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type CoeffReturnType; typedef typename internal::remove_const::type PacketReturnType; - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 558d0c83d..374edb605 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -116,7 +116,7 @@ struct FullReducer { template static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - assert(false && "Should only be called on floats"); + eigen_assert(false && "Should only be called on floats"); } static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { @@ -126,7 +126,7 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel), + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } }; -- cgit v1.2.3 From 91678f489a89b1f4a393cd3b703e67922b5c4257 Mon Sep 17 00:00:00 2001 From: Jeremy Barnes Date: Sun, 10 Jan 2016 22:44:45 -0500 Subject: Cleaned up double-defined macro from last commit --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 359a01b8f..c74613873 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -238,10 +238,6 @@ struct GpuDevice { }; -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - assert(cudaGetLastError() == cudaSuccess); - #ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ -- cgit v1.2.3 From 780623261eedd996404795dfb7928e680408adb5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 09:07:14 -0800 Subject: Re-enabled the optimized reduction CUDA code. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index fd7064459..cea32d05f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -527,7 +527,6 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. 
-#if 0 else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -563,7 +562,6 @@ struct TensorEvaluator, Device> return false; } } -#endif return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 558d0c83d..198b3604c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,7 +126,7 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel), + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } }; @@ -222,7 +222,7 @@ struct InnerReducer { const int num_per_thread = 128; const int num_blocks = 32; - LAUNCH_CUDA_KERNEL((InnerReductionKernel), + LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; @@ -279,7 +279,7 @@ struct OuterReducer { device.maxCudaThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini(max_blocks, dyn_blocks); - LAUNCH_CUDA_KERNEL((OuterReductionKernel), + LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; -- cgit v1.2.3 From 2ccb1c86342203cbc25a587590149d2cf5175900 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 10:36:37 -0800 Subject: Fixed a bug in the dispatch of optimized reduction kernels. --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index cea32d05f..7bd2326b0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -438,19 +438,18 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), YOU_MADE_A_PROGRAMMING_MISTAKE); - // Bitmap indicating if an input dimension is reduced or not. - array reduced; + // Build the bitmap indicating if an input dimension is reduced or not. for (int i = 0; i < NumInputDims; ++i) { - reduced[i] = false; + m_reduced[i] = false; } for (int i = 0; i < NumReducedDims; ++i) { eigen_assert(op.dims()[i] >= 0); eigen_assert(op.dims()[i] < NumInputDims); - reduced[op.dims()[i]] = true; + m_reduced[op.dims()[i]] = true; } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - internal::DimInitializer::run(input_dims, reduced, &m_dimensions, &m_reducedDims); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); // Precompute output strides. 
if (NumOutputDims > 0) { @@ -485,7 +484,7 @@ struct TensorEvaluator, Device> int outputIndex = 0; int reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { + if (m_reduced[i]) { m_reducedStrides[reduceIndex] = input_strides[i]; ++reduceIndex; } else { @@ -531,9 +530,9 @@ struct TensorEvaluator, Device> bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { - reducing_inner_dims &= m_reducedDims[i]; + reducing_inner_dims &= m_reduced[i]; } else { - reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; } } if (internal::InnerReducer::HasOptimizedImplementation && @@ -548,9 +547,9 @@ struct TensorEvaluator, Device> bool preserving_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { - preserving_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; } else { - preserving_inner_dims &= m_reducedDims[i]; + preserving_inner_dims &= m_reduced[i]; } } if (internal::OuterReducer::HasOptimizedImplementation && @@ -689,6 +688,8 @@ struct TensorEvaluator, Device> return startInput; } + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. -- cgit v1.2.3 From b523771a24320014abfec537b0f4b568c19882eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 14:25:43 -0800 Subject: Silenced several compilation warnings triggered by nvcc. --- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 48 ++++++++++++++++------ .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 8 ++-- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 12 +++--- 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index c74613873..0f67f0f57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -10,7 +10,6 @@ #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - namespace Eigen { // This defines an interface that GPUDevice can take to use @@ -206,20 +205,45 @@ struct GpuDevice { #endif } - inline int getNumCudaMultiProcessors() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().multiProcessorCount; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int maxCudaThreadsPerBlock() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().maxThreadsPerBlock; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int maxCudaThreadsPerMultiProcessor() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().maxThreadsPerMultiProcessor; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int sharedMemPerBlock() const { + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE int sharedMemPerBlock() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().sharedMemPerBlock; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int majorDeviceVersion() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().major; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } // This function checks if the CUDA runtime recorded an error for the @@ -239,13 +263,13 @@ struct GpuDevice { }; #ifndef __CUDA_ARCH__ -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); #else -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); #endif @@ -260,4 +284,4 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { } // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d93e1de1b..d2ab70f2b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template class TensorExecutor { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; template class TensorExecutor { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -213,7 +213,7 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { /*static*/ template -inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); @@ -232,7 +232,7 @@ inline void TensorExecutor::run(const Expression& /*static*/ template -inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 3fa3d5c3c..867654aff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -115,8 +115,8 @@ struct FullReducer { internal::is_same::value; template - static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - eigen_assert(false && "Should only be called on floats"); + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(false && "Should only be called on floats"); } static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { @@ -210,11 +210,11 @@ struct InnerReducer { internal::is_same::value; template - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; @@ -264,11 +264,11 @@ struct OuterReducer { internal::is_same::value; template - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; -- cgit v1.2.3 From 0504c56ea7b1dc4d804580cbc498b189ffd06b6e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 15:49:21 -0800 Subject: Silenced a nvcc compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 867654aff..91b6e68dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -119,7 +119,7 @@ struct FullReducer { assert(false && "Should only be called on floats"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); -- cgit v1.2.3 From 01c55d37e69ca3c45f4390b7e15c310028e8b8ed Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 15:53:19 -0800 Subject: Deleted unused variable. 
--- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 91b6e68dd..59404dba5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -217,7 +217,6 @@ struct InnerReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = 32; -- cgit v1.2.3 From 4f7714d72cb8a34dd42081a6ece310c984392354 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 16:01:00 -0800 Subject: Enabled the use of fixed dimensions from within a cuda kernel. --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index f3c9a3148..2692563a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -110,14 +110,14 @@ struct Sizes : internal::numeric_list { return internal::arg_prod(Indices...); } - Sizes() { } + EIGEN_DEVICE_FUNC Sizes() { } template - explicit Sizes(const array& /*indices*/) { + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - template Sizes(DenseIndex...) { } - explicit Sizes(std::initializer_list /*l*/) { + template EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { // todo: add assertion } #endif -- cgit v1.2.3 From f894736d61198eef9d6463d3cdde8dbe1171a56b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 16:42:18 -0800 Subject: Updated the tensor traits: the alignment is not part of the Flags enum anymore --- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 7a9568b36..2f06f8442 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -20,7 +20,7 @@ class compute_tensor_flags enum { is_dynamic_size_storage = 1, - aligned_bit = + is_aligned = ( ((Options&DontAlign)==0) && ( #if EIGEN_MAX_STATIC_ALIGN_BYTES>0 @@ -35,12 +35,12 @@ class compute_tensor_flags 0 #endif ) - ) ? AlignedBit : 0, - packet_access_bit = packet_traits::Vectorizable && aligned_bit ? PacketAccessBit : 0 + ), + packet_access_bit = packet_traits::Vectorizable && is_aligned ? PacketAccessBit : 0 }; public: - enum { ret = packet_access_bit | aligned_bit}; + enum { ret = packet_access_bit}; }; @@ -86,7 +86,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = Options_, - Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = BaseTraits::Flags, }; }; @@ -102,7 +102,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, - Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), + Flags = BaseTraits::Flags, }; }; -- cgit v1.2.3 From c5e6900400663901013dc48a3492e756b21c131e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:06:39 -0800 Subject: Silenced a few compilation warnings. --- unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 7 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 6 +++++- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 90ee50678..126a9e4ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1261,7 +1261,7 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { @@ -1274,7 +1274,7 @@ struct TensorEvaluatorm_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { @@ -1313,10 +1313,11 @@ struct TensorEvaluator + template EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) // rows in left side const Index m = this->m_i_size; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 0f67f0f57..5abdc489b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -275,10 +275,14 @@ struct GpuDevice { // FIXME: Should be device and kernel specific. #ifdef __CUDACC__ -static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +#ifndef __CUDA_ARCH__ cudaError_t status = cudaDeviceSetSharedMemConfig(config); EIGEN_UNUSED_VARIABLE(status) assert(status == cudaSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 59404dba5..89f055134 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -220,6 +220,8 @@ struct InnerReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = 32; + EIGEN_UNUSED_VARIABLE(block_size) + EIGEN_UNUSED_VARIABLE(num_blocks) LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); -- cgit v1.2.3 From bbdabbb37976a37ab585d266691100bdd035f30b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:26:56 -0800 Subject: Made the blas utils usable from within a cuda kernel --- Eigen/src/Core/util/BlasUtil.h | 42 +++++++++++++++++++++--------------------- Eigen/src/Core/util/Memory.h | 7 ++++--- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index d00fa9707..498db3a70 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -123,18 +123,18 @@ template struct get_factor::R template class BlasVectorMapper { public: - EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
BlasVectorMapper(Scalar *data) : m_data(data) {} - EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { return m_data[i]; } template - EIGEN_ALWAYS_INLINE Packet load(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const { return ploadt(m_data + i); } template - bool aligned(Index i) const { + EIGEN_DEVICE_FUNC bool aligned(Index i) const { return (size_t(m_data+i)%sizeof(Packet))==0; } @@ -148,25 +148,25 @@ class BlasLinearMapper { typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} - EIGEN_ALWAYS_INLINE void prefetch(int i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); } - EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i]; } - EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { return ploadt(m_data + i); } - EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { return ploadt(m_data + i); } - EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { pstoret(m_data + i, p); } @@ -184,18 +184,18 @@ class blas_data_mapper { typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; - EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_ALWAYS_INLINE blas_data_mapper + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper getSubMapper(Index i, Index j) const { return blas_data_mapper(&operator()(i, j), m_stride); } - EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { return LinearMapper(&operator()(i, j)); } - EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(&operator()(i, j)); } @@ -205,28 +205,28 @@ class blas_data_mapper { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { return ploadt(&operator()(i, j)); } - EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { return ploadt(&operator()(i, j)); } template - EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); } template - EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { return pgather(&operator()(i, j), m_stride); } - const Index stride() const { return m_stride; } - const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } - Index firstAligned(Index size) const { + EIGEN_DEVICE_FUNC Index firstAligned(Index size) const { if (size_t(m_data)%sizeof(Scalar)) { return -1; } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 1a899ea6c..823e077af 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -524,7 +524,7 @@ template EIGEN_DEVICE_FUNC inline void conditional_align * \sa first_default_aligned() */ template -inline Index first_aligned(const Scalar* array, Index size) +EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size) { static const Index ScalarSize = sizeof(Scalar); static const Index AlignmentSize = Alignment / ScalarSize; @@ -544,14 +544,15 @@ inline Index first_aligned(const Scalar* array, Index size) } else { - return std::min( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size); + Index first = (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask; + return (first < size) ? first : size; } } /** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement. * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase) */ template -inline Index first_default_aligned(const Scalar* array, Index size) +EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size) { typedef typename packet_traits::type DefaultPacketType; return first_aligned::alignment>(array, size); -- cgit v1.2.3 From bd7d901da9bd824ee2a7a94d3b0c8668d77f5ff2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:49:44 -0800 Subject: Reverted a previous change that tripped nvcc when compiling in debug mode. 
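The grid-size arithmetic in the hunk below relies on the usual integer ceiling-division idiom. As a minimal standalone sketch of that idiom (the helper name ceil_div is illustrative and not part of the patch):

    // Smallest k with k * b >= m, for positive m and b;
    // (m + 127) / 128 in the hunk below is exactly ceil_div(m, 128).
    inline int ceil_div(int m, int b) { return (m + b - 1) / b; }
    // e.g. ceil_div(130, 128) == 2: 130 rows need two 128-row blocks.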
--- unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 126a9e4ad..a5f3debc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1274,7 +1274,7 @@ struct TensorEvaluatorm_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { @@ -1313,7 +1313,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC + template void evalTyped(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; @@ -1362,7 +1362,7 @@ struct TensorEvaluator), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); } else { - const Index m_blocks = (m + 127) / 128; + const Index m_blocks = (m + 127) / 128; const Index n_blocks = (n + 63) / 64; const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 block_size(8, 32, 1); -- cgit v1.2.3 From d920d57f38e07739403a6c1e224c74fec5a36e6f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Jan 2016 11:32:27 -0800 Subject: Improved the performance of the contraction of a 2d tensor with a 1d tensor by a factor of 3 or more. This helps speedup LSTM neural networks. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 48 ++++++++++++---------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index eda93a1de..63d0c6f68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -32,7 +32,7 @@ enum { template + int packet_size, bool inner_dim_contiguous, int Alignment> class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC @@ -144,11 +144,11 @@ class SimpleTensorContractionMapper { return IndexPair(linidx[0], linidx[1]); } - Index firstAligned(Index size) const { - return size; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + return (Alignment == Aligned) ? 0 : size; } - Index stride() const { - return 1; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous) ? 
m_contract_strides[0] : 1; } protected: @@ -165,10 +165,10 @@ template - class BaseTensorContractionMapper : public SimpleTensorContractionMapper +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -181,6 +181,7 @@ template::type Packet; typedef typename packet_traits::half HalfPacket; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { // whole method makes column major assumption @@ -192,7 +193,7 @@ templatecomputeIndex(i, j); eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); - return this->m_tensor.template packet(index); + return this->m_tensor.template packet(index); } const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); @@ -207,7 +208,7 @@ template::value <= 1 || !inner_dim_reordered) && (last - first) == (packet_size - 1)) { - return this->m_tensor.template packet(first); + return this->m_tensor.template packet(first); } EIGEN_ALIGN_MAX Scalar data[packet_size]; @@ -223,6 +224,7 @@ template(data); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { // whole method makes column major assumption @@ -230,7 +232,7 @@ template::size; if (half_packet_size == packet_size) { - return loadPacket(i, j); + return loadPacket(i, j); } EIGEN_ALIGN_MAX Scalar data[half_packet_size]; for (Index k = 0; k < half_packet_size; k++) { @@ -246,10 +248,10 @@ template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -260,13 +262,13 @@ class BaseTensorContractionMapper::type Packet; - EIGEN_DEVICE_FUNC + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); return pload::type>(data); } - EIGEN_DEVICE_FUNC + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { return loadPacket(i, j); } @@ -304,14 +306,14 @@ class TensorContractionSubMapper { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { @@ -325,12 +327,12 @@ class TensorContractionSubMapper { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), 
YOU_MADE_A_PROGRAMMING_MISTAKE); - return loadPacket(i); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } template - EIGEN_DEVICE_FUNC bool aligned(Index) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { return false; } @@ -741,17 +743,19 @@ struct TensorContractionEvaluatorBase typedef TensorEvaluator RightEvaluator; const Index lhs_packet_size = internal::packet_traits::size; const Index rhs_packet_size = internal::packet_traits::size; + const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; + const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned; typedef internal::TensorContractionInputMapper LhsMapper; + false, lhs_alignment> LhsMapper; typedef internal::TensorContractionInputMapper RhsMapper; + rhs_inner_dim_reordered, rhs_alignment> RhsMapper; LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); -- cgit v1.2.3 From 79b69b7444cfae2f7631e873e822cdca6f4e355f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Jan 2016 15:21:09 -0800 Subject: Trigger the optimized matrix vector path more conservatively. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 63d0c6f68..72a378dfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -145,7 +145,10 @@ class SimpleTensorContractionMapper { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { - return (Alignment == Aligned) ? 0 : size; + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1; -- cgit v1.2.3 From 9f013a9d86ad5cf82939bfeab2223652a821c448 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Jan 2016 14:24:37 -0800 Subject: Properly record the rank of reduced tensors in the tensor traits. 
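Put differently, reducing over K of an expression's N dimensions yields a rank N-K result, which the traits below now record at compile time. A minimal standalone C++11 sketch of the same bookkeeping (simplified; not Eigen's actual traits machinery):

    #include <cstddef>

    // Rank of the result of reducing a rank-N input over K dimensions.
    template <std::size_t NumInputDims, std::size_t NumReducedDims>
    struct reduced_rank {
      static const std::size_t value = NumInputDims - NumReducedDims;
    };

    // e.g. summing a rank-4 tensor over 2 dimensions leaves a rank-2 result.
    static_assert(reduced_rank<4, 2>::value == 2, "rank drops by the reduced dims");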
--- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index c783aab97..781a37e34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -134,7 +134,7 @@ struct traits > : public traits::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; static const int Layout = XprTraits::Layout; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 8028e71c0..2dc8815b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -24,11 +24,14 @@ template struct traits > : traits { - typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; }; template -- cgit v1.2.3 From 8fe2532e70a8e0261717003d96d4df41ab978756 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Jan 2016 09:29:48 -0800 Subject: Fixed a boundary condition bug in the outer reduction kernel --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 89f055134..54ab34ba1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -241,7 +241,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu } // Do the reduction. - const Index max_iter = divup(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; + const Index max_iter = num_preserved_coeffs * numext::maxi(1, (num_coeffs_to_reduce - NumPerThread + 1)); for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; -- cgit v1.2.3 From aed4cb1269d52d0ff0e69c8aa6d89c804185b18f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Jan 2016 21:45:14 -0800 Subject: Use warp shuffles instead of shared memory access to speedup the inner reduction kernel. 
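Each warp now combines its partial values through register-to-register shuffles, which removes the shared-memory staging buffer and the associated synchronization from the inner reduction kernel. A minimal sketch of the pattern for a plain sum (illustrative only; the helper name is hypothetical, and it uses the pre-CUDA-9 __shfl_down intrinsic that this kernel targets):

    __device__ float warp_reduce_sum(float val) {
    #pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        // Each lane adds the value held by the lane 'offset' positions away.
        val += __shfl_down(val, offset);
      }
      return val;  // lane 0 of each warp ends up holding the warp-wide sum
    }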
--- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 54ab34ba1..82ea09f07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -132,8 +132,6 @@ struct FullReducer { }; -extern __shared__ float temp[]; - template __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, @@ -183,17 +181,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } - temp[threadIdx.x] = reduced_val; - - __syncthreads(); - const int warp_id = threadIdx.x & 31; - if (warp_id < 16) reducer.reduce(temp[threadIdx.x + 16], &temp[threadIdx.x]); - if (warp_id < 8) reducer.reduce(temp[threadIdx.x + 8], &temp[threadIdx.x]); - if (warp_id < 4) reducer.reduce(temp[threadIdx.x + 4], &temp[threadIdx.x]); - if (warp_id < 2) reducer.reduce(temp[threadIdx.x + 2], &temp[threadIdx.x]); - if (warp_id < 1) { - reducer.reduce(temp[threadIdx.x + 1], &temp[threadIdx.x]); - atomicReduce(&(output[row]), temp[threadIdx.x], reducer); +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); } } @@ -224,7 +218,7 @@ struct InnerReducer { EIGEN_UNUSED_VARIABLE(num_blocks) LAUNCH_CUDA_KERNEL((InnerReductionKernel), - num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; -- cgit v1.2.3 From 0461f0153ef66fb95d16b620675c97107ef7a342 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Jan 2016 11:22:16 -0800 Subject: Made it possible to compare tensor dimensions inside a CUDA kernel. 
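Marking internal::sizes_match_below_dim::run and dimensions_match as EIGEN_DEVICE_FUNC makes them callable from device code, so dimension checks no longer have to be performed on the host. A sketch of what this enables (hypothetical kernel, not part of the patch):

    template <typename Dims1, typename Dims2>
    __global__ void check_dims(Dims1 lhs, Dims2 rhs, bool* match) {
      // Compiles only now that dimensions_match is usable in device code.
      *match = Eigen::dimensions_match(lhs, rhs);
    }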
--- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 2692563a9..52569a359 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -405,20 +405,20 @@ template
struct sizes_match_below_dim {
- static inline bool run(Dims1&, Dims2&) {
+ static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
return false;
}
};
template
struct sizes_match_below_dim {
- static inline bool run(Dims1& dims1, Dims2& dims2) {
+ static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) {
return (array_get(dims1) == array_get(dims2)) & sizes_match_below_dim::run(dims1, dims2);
}
};
template
struct sizes_match_below_dim {
- static inline bool run(Dims1&, Dims2&) {
+ static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
return true;
}
};
@@ -427,7 +427,7 @@ struct sizes_match_below_dim {
template
-bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2);
}
-- cgit v1.2.3
From 34057cff23bc3b9d59117af9bea0b3e098e3eaea Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 15 Jan 2016 15:11:56 -0800
Subject: Fixed a race condition that could affect some reductions on CUDA devices.

--- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 72 ++++++++++++++++++---- 1 file changed, 61 insertions(+), 11 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 82ea09f07..2da18b147 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -76,13 +76,24 @@ __device__ inline void atomicReduce(T* output, T accum, SumReducer&) {
#endif
}
+
+template
+__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index num_threads = blockDim.x * gridDim.x;
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = val;
+ }
+}
+
template
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, typename Self::CoeffReturnType* output) {
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
- if (first_index == 0) {
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+ if (gridDim.x == 1 && first_index == 0) {
*output = reducer.initialize();
}
@@ -126,6 +137,14 @@ struct FullReducer {
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread));
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be race conditions between multiple thread blocks.
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel),
+ 1, 32, 0, device, reducer.initialize(), 1, output);
+ }
+
LAUNCH_CUDA_KERNEL((FullReductionKernel),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
}
@@ -150,8 +169,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
}
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
@@ -211,11 +233,25 @@ struct InnerReducer {
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 128;
- const int num_blocks = 32;
- EIGEN_UNUSED_VARIABLE(block_size)
- EIGEN_UNUSED_VARIABLE(num_blocks)
+ const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be race conditions between multiple thread blocks.
+ const int dyn_blocks = divup(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini(max_blocks, dyn_blocks);
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel),
+ num_blocks, 1024, 0, device, reducer.initialize(),
+ num_preserved_vals, output);
+ }
LAUNCH_CUDA_KERNEL((InnerReductionKernel),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
@@ -229,9 +265,11 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu
typename Self::CoeffReturnType* output) {
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- // Initialize the output values
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
}
// Do the reduction.
@@ -266,14 +304,26 @@ struct OuterReducer {
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
- const int dyn_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread));
+ const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini(max_blocks, dyn_blocks);
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be race conditions between multiple thread blocks.
+ const int dyn_blocks = divup(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini(max_blocks, dyn_blocks);
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel),
+ num_blocks, 1024, 0, device, reducer.initialize(),
+ num_preserved_vals, output);
+ }
+
LAUNCH_CUDA_KERNEL((OuterReductionKernel),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
}
-- cgit v1.2.3
From 6a75e7e0d5505bac0e1a5be39d37f2717ab03134 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo
Date: Fri, 15 Jan 2016 16:32:21 -0800
Subject: Digamma cleanup

* Added permission from cephes author to use his code
* Cleanup in ArrayCwiseUnaryOps

--- Eigen/src/Core/SpecialFunctions.h | 32 +++++++++++++++++++++++--------- Eigen/src/plugins/ArrayCwiseUnaryOps.h | 3 --- 2 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index 8cf26f4d1..bd022946c 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -13,6 +13,29 @@ namespace Eigen {
namespace internal {
+// Parts of this code are based on the Cephes Math Library.
+//
+// Cephes Math Library Release 2.8: June, 2000
+// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+// Permission has been kindly provided by the original author
+// to incorporate the Cephes software into the Eigen codebase:
+//
+// From: Stephen Moshier
+// To: Eugene Brevdo
+// Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+// Hello Eugene,
+//
+// Thank you for writing.
+//
+// If your licensing is similar to BSD, the formal way that has been
+// handled is simply to add a statement to the effect that you are incorporating
+// the Cephes software by permission of the author.
+//
+// Good luck with your project,
+// Steve
+
namespace cephes {
/* polevl (modified for Eigen)
@@ -178,11 +201,6 @@ struct digamma_impl {
* message condition value returned
* psi singularity x integer <=0 INFINITY
*/
- /*
- Cephes Math Library Release 2.2: June, 1992
- Copyright 1984, 1987, 1992 by Stephen L. Moshier
- Direct inquiries to 30 Frost Street, Cambridge, MA 02140
- */
EIGEN_DEVICE_FUNC
static float run(float xx) {
float p, q, nz, x, s, w, y, z;
@@ -297,10 +315,6 @@ struct digamma_impl {
* psi singularity x integer <=0 INFINITY
*/
- /*
- * Cephes Math Library Release 2.8: June, 2000
- * Copyright 1984, 1987, 1992, 2000 by Stephen L.
Moshier - */ double p, q, nz, s, w, y, z; bool negative; diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index e818ac588..2ce7414a1 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -320,9 +320,6 @@ lgamma() const } /** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma). - * - * Example: \include Cwise_digamma.cpp - * Output: \verbinclude Cwise_digamma.out * * \sa cos(), sin(), tan() */ -- cgit v1.2.3 From 63fb66f53a576e4ae7bd6b28d011a7e33b7757de Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Sun, 17 Jan 2016 21:25:36 -0700 Subject: Add ctor for long --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 4f2adb671..19352eb5e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -40,6 +40,12 @@ struct TensorUInt128 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(unsigned int x) : high(0), low(x) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(long x) : high(0), low(x) { + eigen_assert(x >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(unsigned long x) : high(0), low(x) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(int64_t x) : high(0), low(x) { eigen_assert(x >= 0); } -- cgit v1.2.3 From 5b7713dd33f5d16282f69f10c33a739554762782 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:05:10 -0800 Subject: Record whether the underlying tensor storage can be accessed directly during the evaluation of an expression. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 3 +++ unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 4 ++++ unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 +++ unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 + 28 files changed, 53 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h 
b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index ad525bac8..dc6ca4909 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -78,7 +78,8 @@ class Tensor : public TensorBase0) & !(Options_&DontAlign), PacketAccess = (internal::packet_traits::size > 1), Layout = Options_ & RowMajor ? RowMajor : ColMajor, - CoordAccess = true + CoordAccess = true, + RawAccess = true }; static const int Options = Options_; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 781a37e34..f1ec04c49 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -89,6 +89,7 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -210,6 +211,7 @@ struct TensorEvaluator, Devi BlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index a41d4d265..10fac0cc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess, }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : @@ -152,6 +153,8 @@ struct TensorEvaluator, Device> return m_leftImpl.template packet(index); } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + private: TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9ec20a99e..efca7cd79 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -109,6 +109,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index abc3c92ca..a209e885b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -145,6 +145,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -304,6 +305,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 3d153bb94..f57d2bb7d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -125,6 +125,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -287,6 +288,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 3ca7daf32..877bcd0df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -162,6 +162,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess && internal::type_casting_traits::VectorizedCast, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a82bfc0aa..367a152a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -306,6 +306,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -752,6 +753,7 @@ struct TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 0157f6fab..0f8a98caf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Devi BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) @@ -250,6 +251,7 @@ struct TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index ff4373f59..e7daf7304 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -98,6 +98,7 @@ struct TensorEvaluator, Device> PacketAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -140,7 +141,7 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } private: 
TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 902f25247..f726585b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, Layout = Derived::Layout, CoordAccess = NumCoords > 0, + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -148,6 +149,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, Layout = Derived::Layout, CoordAccess = NumCoords > 0, + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -207,6 +209,7 @@ struct TensorEvaluator, Device> PacketAccess = internal::functor_traits::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC @@ -257,6 +260,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -312,6 +316,7 @@ struct TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -378,6 +383,7 @@ struct TensorEvaluator internal::packet_traits::HasBlend, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 215a4ebad..3bfaf6d23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -135,6 +135,7 @@ struct TensorEvaluator, D BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index a4d6ce6b3..28cb9f02d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -44,7 +44,8 @@ class TensorFixedSize : public TensorBase::size > 1), Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, CoordAccess = true, - }; + RawAccess = true + }; typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c16bf7e67..c9b0b2f28 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -92,6 +92,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, + RawAccess = true }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 9316c9831..96f74b992 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 11e510414..2ab332add 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -168,6 +168,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = NumDims == 5, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index ae9e9f751..52d89ad01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,7 +91,8 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented - }; + RawAccess = false + }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index f612bbd45..a37516974 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -123,6 +123,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? 
RowMajor : ColMajor, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 5c759af09..6f69da34a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -49,7 +49,8 @@ template class TensorMap : public Tensor IsAligned = ((int(Options_)&Aligned)==Aligned), PacketAccess = (internal::packet_traits::size > 1), Layout = PlainObjectType::Layout, - CoordAccess = true + CoordAccess = true, + RawAccess = true }; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d8c923d74..0524cf0d8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -110,6 +110,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -170,6 +171,7 @@ template PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -317,6 +319,7 @@ struct TensorEvaluator, Devi PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -545,6 +548,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 91e32d200..39a305a93 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -93,6 +93,7 @@ struct TensorEvaluator, Device PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = true, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 8fb53f4f2..2cbb820b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -94,6 +94,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = true, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 2dc8815b8..09ee0c2c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -428,6 +428,7 @@ struct 
TensorEvaluator, Device> PacketAccess = Self::InputPacketAccess && Op::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 6b25b2ba0..57197d060 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -139,6 +139,7 @@ template class TensorRef : public TensorBase, Device> PacketAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) @@ -412,6 +414,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 10328c61f..846f81e0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -239,6 +240,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 15a22aa1b..c4adb7d4c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device> PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -225,6 +226,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 97b6168a9..2c2eb6515 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -112,6 +112,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -258,6 +259,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 6625c66d5..52b78b261 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -181,6 +181,7 @@ struct TensorEvaluator, D BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = NumDims == 6, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) -- cgit v1.2.3 From b3b722905f3df26a34cdda4f2cee74aa62403040 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:09:47 -0800 Subject: Improved code indentation --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 28cb9f02d..70282dd83 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -54,7 +54,7 @@ class TensorFixedSize : public TensorBase m_storage; public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } -- cgit v1.2.3 From 6d472d83754e0f16db1deb69218e10c2b21268b1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:22:05 -0800 Subject: Moved the contraction mapping code to its own file to make the code more manageable. --- unsupported/Eigen/CXX11/Tensor | 1 + .../Eigen/CXX11/src/Tensor/TensorContraction.h | 357 ------------------- .../CXX11/src/Tensor/TensorContractionMapper.h | 377 +++++++++++++++++++++ 3 files changed, 378 insertions(+), 357 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7481a9ddb..1c5734383 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -88,6 +88,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorReductionCuda.h" #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" +#include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" #include "src/Tensor/TensorContractionCuda.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 72a378dfd..506696ae9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -21,363 +21,6 @@ namespace Eigen { */ namespace internal { -enum { - Rhs = 0, - Lhs = 1, -}; - -/* - * Implementation of the Eigen blas_data_mapper class for tensors. 
- */ -template -class SimpleTensorContractionMapper { - public: - EIGEN_DEVICE_FUNC - SimpleTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - m_tensor(tensor), - m_nocontract_strides(nocontract_strides), - m_ij_strides(ij_strides), - m_contract_strides(contract_strides), - m_k_strides(k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row) const { - // column major assumption - return operator()(row, 0); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { - return m_tensor.coeff(computeIndex(row, col)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { - const bool left = (side == Lhs); - Index nocontract_val = left ? row : col; - Index linidx = 0; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = nocontract_val / m_ij_strides[i]; - linidx += idx * m_nocontract_strides[i]; - nocontract_val -= idx * m_ij_strides[i]; - } - if (array_size::value > array_size::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx += nocontract_val; - } else { - linidx += nocontract_val * m_nocontract_strides[0]; - } - } - - Index contract_val = left ? col : row; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = contract_val / m_k_strides[i]; - linidx += idx * m_contract_strides[i]; - contract_val -= idx * m_k_strides[i]; - } - - if(array_size::value > 0) { - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx += contract_val; - } else { - linidx += contract_val * m_contract_strides[0]; - } - } - - return linidx; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { - const bool left = (side == Lhs); - Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; - Index linidx[2] = {0, 0}; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = nocontract_val[0] / m_ij_strides[i]; - const Index idx1 = nocontract_val[1] / m_ij_strides[i]; - linidx[0] += idx0 * m_nocontract_strides[i]; - linidx[1] += idx1 * m_nocontract_strides[i]; - nocontract_val[0] -= idx0 * m_ij_strides[i]; - nocontract_val[1] -= idx1 * m_ij_strides[i]; - } - if (array_size::value > array_size::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx[0] += nocontract_val[0]; - linidx[1] += nocontract_val[1]; - } else { - linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; - linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; - } - } - - Index contract_val[2] = {left ? col : row, left ? 
col : row + distance};
- for (int i = static_cast(array_size::value) - 1; i > 0; i--) {
- const Index idx0 = contract_val[0] / m_k_strides[i];
- const Index idx1 = contract_val[1] / m_k_strides[i];
- linidx[0] += idx0 * m_contract_strides[i];
- linidx[1] += idx1 * m_contract_strides[i];
- contract_val[0] -= idx0 * m_k_strides[i];
- contract_val[1] -= idx1 * m_k_strides[i];
- }
-
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx[0] += contract_val[0];
- linidx[1] += contract_val[1];
- } else {
- linidx[0] += contract_val[0] * m_contract_strides[0];
- linidx[1] += contract_val[1] * m_contract_strides[0];
- }
- return IndexPair(linidx[0], linidx[1]);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
- // Only claim alignment when we can compute the actual stride (i.e. when we're
- // dealing with the lhs with inner_dim_contiguous). This is because the
- // matrix-vector product relies on the stride when dealing with aligned inputs.
- return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
- }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
- return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
- }
-
- protected:
- const Tensor m_tensor;
- const nocontract_t m_nocontract_strides;
- const nocontract_t m_ij_strides;
- const contract_t m_contract_strides;
- const contract_t m_k_strides;
-};
-
-
-template
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper
-{
- public:
- typedef SimpleTensorContractionMapper ParentMapper;
-
- EIGEN_DEVICE_FUNC
- BaseTensorContractionMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides) :
- ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
- typedef typename packet_traits::type Packet;
- typedef typename packet_traits::half HalfPacket;
-
- template
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
- // whole method makes column major assumption
-
- // don't need to add offsets for now (because operator handles that)
- // current code assumes packet size must be a multiple of 2
- EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
- const Index index = this->computeIndex(i, j);
- eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
- return this->m_tensor.template packet(index);
- }
-
- const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1);
- const Index first = indexPair.first;
- const Index last = indexPair.second;
-
- // We can always do optimized packet reads from left hand side right now, because
- // the vertical matrix dimension on the left hand side is never contracting.
- // On the right hand side we need to check if the contracting dimensions may have
- // been shuffled first.
- if (Tensor::PacketAccess && - (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && - (last - first) == (packet_size - 1)) { - - return this->m_tensor.template packet(first); - } - - EIGEN_ALIGN_MAX Scalar data[packet_size]; - - data[0] = this->m_tensor.coeff(first); - for (Index k = 1; k < packet_size - 1; k += 2) { - const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); - data[k] = this->m_tensor.coeff(internal_pair.first); - data[k + 1] = this->m_tensor.coeff(internal_pair.second); - } - data[packet_size - 1] = this->m_tensor.coeff(last); - - return pload(data); - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - const Index half_packet_size = unpacket_traits::size; - if (half_packet_size == packet_size) { - return loadPacket(i, j); - } - EIGEN_ALIGN_MAX Scalar data[half_packet_size]; - for (Index k = 0; k < half_packet_size; k++) { - data[k] = operator()(i + k, j); - } - return pload(data); - } -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename packet_traits::type Packet; - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload::type>(data); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { - return loadPacket(i, j); - } -}; - -template -class TensorContractionInputMapper; - -template -class TensorContractionSubMapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - - typedef TensorContractionInputMapper ParentMapper; - typedef TensorContractionSubMapper Self; - typedef Self LinearMapper; - - EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { - return m_base_mapper(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { - return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { - m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
LinearMapper getLinearMapper(Index i, Index j) const { - return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { - return false; - } - - private: - const ParentMapper& m_base_mapper; - const Index m_vert_offset; - const Index m_horiz_offset; -}; - - -template -class TensorContractionInputMapper - : public BaseTensorContractionMapper { - - public: - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; - typedef SubMapper VectorMapper; - - EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) - : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { - return SubMapper(*this, i, j); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { - return VectorMapper(*this, i, j); - } -}; - - - template struct traits > { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 000000000..b25b34d61 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,377 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +namespace Eigen { + +namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +template +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? 
row : col;
+ Index linidx = 0;
+ for (int i = static_cast(array_size::value) - 1; i > 0; i--) {
+ const Index idx = nocontract_val / m_ij_strides[i];
+ linidx += idx * m_nocontract_strides[i];
+ nocontract_val -= idx * m_ij_strides[i];
+ }
+ if (array_size::value > array_size::value) {
+ if (side == Lhs && inner_dim_contiguous) {
+ eigen_assert(m_nocontract_strides[0] == 1);
+ linidx += nocontract_val;
+ } else {
+ linidx += nocontract_val * m_nocontract_strides[0];
+ }
+ }
+
+ Index contract_val = left ? col : row;
+ for (int i = static_cast(array_size::value) - 1; i > 0; i--) {
+ const Index idx = contract_val / m_k_strides[i];
+ linidx += idx * m_contract_strides[i];
+ contract_val -= idx * m_k_strides[i];
+ }
+
+ if(array_size::value > 0) {
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx += contract_val;
+ } else {
+ linidx += contract_val * m_contract_strides[0];
+ }
+ }
+
+ return linidx;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const {
+ const bool left = (side == Lhs);
+ Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
+ Index linidx[2] = {0, 0};
+ for (int i = static_cast(array_size::value) - 1; i > 0; i--) {
+ const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+ const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+ linidx[0] += idx0 * m_nocontract_strides[i];
+ linidx[1] += idx1 * m_nocontract_strides[i];
+ nocontract_val[0] -= idx0 * m_ij_strides[i];
+ nocontract_val[1] -= idx1 * m_ij_strides[i];
+ }
+ if (array_size::value > array_size::value) {
+ if (side == Lhs && inner_dim_contiguous) {
+ eigen_assert(m_nocontract_strides[0] == 1);
+ linidx[0] += nocontract_val[0];
+ linidx[1] += nocontract_val[1];
+ } else {
+ linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
+ linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
+ }
+ }
+
+ Index contract_val[2] = {left ? col : row, left ? col : row + distance};
+ for (int i = static_cast(array_size::value) - 1; i > 0; i--) {
+ const Index idx0 = contract_val[0] / m_k_strides[i];
+ const Index idx1 = contract_val[1] / m_k_strides[i];
+ linidx[0] += idx0 * m_contract_strides[i];
+ linidx[1] += idx1 * m_contract_strides[i];
+ contract_val[0] -= idx0 * m_k_strides[i];
+ contract_val[1] -= idx1 * m_k_strides[i];
+ }
+
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx[0] += contract_val[0];
+ linidx[1] += contract_val[1];
+ } else {
+ linidx[0] += contract_val[0] * m_contract_strides[0];
+ linidx[1] += contract_val[1] * m_contract_strides[0];
+ }
+ return IndexPair(linidx[0], linidx[1]);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
+ // Only claim alignment when we can compute the actual stride (i.e. when we're
+ // dealing with the lhs with inner_dim_contiguous). This is because the
+ // matrix-vector product relies on the stride when dealing with aligned inputs.
+ return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
+ return ((side == Lhs) && inner_dim_contiguous) ?
m_contract_strides[0] : 1; + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
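+ // (Hypothetical illustration: for packet_size == 4, the pair computed above is
+ // first == computeIndex(i, j) and last == computeIndex(i + 3, j), so the fast
+ // path below is only taken when last - first == 3, i.e. when the four
+ // coefficients happen to be contiguous in memory.)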
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_MAX Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_MAX Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename packet_traits::type Packet; + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload::type>(data); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + +template +class TensorContractionInputMapper; + +template +class TensorContractionSubMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } +}; + + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H -- cgit v1.2.3 From df79c00901e2c976f79a0b1518ed9a58a48f0501 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:24:08 -0800 Subject: Improved the formatting of the code --- unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 52d89ad01..2798956ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -92,7 +92,7 @@ struct TensorEvaluator, Device> Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false - }; + }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) -- cgit v1.2.3 From 2832175a689313ba08523489a1a1b8bb6458ac5c Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Tue, 19 Jan 2016 20:12:17 -0700 Subject: Use explicitly 32 bit integer types in constructors. 
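The point of the fixed-width overloads, as a standalone sketch (hypothetical type and names, not the actual TensorUInt128 template): on LP64 platforms int64_t is typically a typedef of long, so keeping both a long and an int64_t constructor defines the same overload twice, while fixed-width 32/64 bit overloads stay distinct on every platform:

    #include <cstdint>
    #include <cassert>

    struct U128ish {  // stand-in for the real TensorUInt128
      uint64_t high, low;
      U128ish(int32_t x)  : high(0), low(static_cast<uint64_t>(x)) { assert(x >= 0); }
      U128ish(uint32_t x) : high(0), low(x) {}
      U128ish(int64_t x)  : high(0), low(static_cast<uint64_t>(x)) { assert(x >= 0); }
      U128ish(uint64_t x) : high(0), low(x) {}
    };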
--- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 19352eb5e..f43f64cde 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -34,17 +34,11 @@ struct TensorUInt128 LOW low; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(int x) : high(0), low(x) { + TensorUInt128(int32_t x) : high(0), low(x) { eigen_assert(x >= 0); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(unsigned int x) : high(0), low(x) { } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(long x) : high(0), low(x) { - eigen_assert(x >= 0); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(unsigned long x) : high(0), low(x) { } + TensorUInt128(uint32_t x) : high(0), low(x) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(int64_t x) : high(0), low(x) { eigen_assert(x >= 0); -- cgit v1.2.3 From 915e7667cd64d45fe6b07d40b0c04d2209a5f6de Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Tue, 19 Jan 2016 21:17:29 -0700 Subject: Remove executable bit from header files --- Eigen/src/Core/AssignEvaluator.h | 0 Eigen/src/Core/Assign_MKL.h | 0 Eigen/src/Core/ProductEvaluators.h | 0 Eigen/src/Core/VectorwiseOp.h | 0 Eigen/src/Core/arch/AltiVec/PacketMath.h | 0 Eigen/src/Core/arch/SSE/PacketMath.h | 0 Eigen/src/Core/products/GeneralMatrixVector_MKL.h | 0 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h | 0 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h | 0 Eigen/src/Core/util/BlasUtil.h | 0 Eigen/src/Core/util/DisableStupidWarnings.h | 0 Eigen/src/Eigenvalues/ComplexSchur_MKL.h | 0 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 0 Eigen/src/Eigenvalues/RealQZ.h | 0 Eigen/src/Eigenvalues/RealSchur_MKL.h | 0 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h | 0 Eigen/src/PardisoSupport/PardisoSupport.h | 0 Eigen/src/QR/ColPivHouseholderQR_MKL.h | 0 Eigen/src/SVD/JacobiSVD.h | 0 Eigen/src/SparseLU/SparseLU.h | 0 bench/btl/generic_bench/timers/portable_timer.hh | 0 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 0 22 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 Eigen/src/Core/AssignEvaluator.h mode change 100755 => 100644 Eigen/src/Core/Assign_MKL.h mode change 100755 => 100644 Eigen/src/Core/ProductEvaluators.h mode change 100755 => 100644 Eigen/src/Core/VectorwiseOp.h mode change 100755 => 100644 Eigen/src/Core/arch/AltiVec/PacketMath.h mode change 100755 => 100644 Eigen/src/Core/arch/SSE/PacketMath.h mode change 100755 => 100644 Eigen/src/Core/products/GeneralMatrixVector_MKL.h mode change 100755 => 100644 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h mode change 100755 => 100644 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h mode change 100755 => 100644 Eigen/src/Core/util/BlasUtil.h mode change 100755 => 100644 Eigen/src/Core/util/DisableStupidWarnings.h mode change 100755 => 100644 Eigen/src/Eigenvalues/ComplexSchur_MKL.h mode change 100755 => 100644 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h mode change 100755 => 100644 Eigen/src/Eigenvalues/RealQZ.h mode change 100755 => 100644 Eigen/src/Eigenvalues/RealSchur_MKL.h mode change 100755 => 100644 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h mode change 100755 => 100644 Eigen/src/PardisoSupport/PardisoSupport.h mode change 100755 => 100644 Eigen/src/QR/ColPivHouseholderQR_MKL.h mode change 100755 => 100644 
Eigen/src/SVD/JacobiSVD.h mode change 100755 => 100644 Eigen/src/SparseLU/SparseLU.h mode change 100755 => 100644 bench/btl/generic_bench/timers/portable_timer.hh mode change 100755 => 100644 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h old mode 100755 new mode 100644 diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h old mode 100755 new mode 100644 diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h old mode 100755 new mode 100644 diff --git a/bench/btl/generic_bench/timers/portable_timer.hh b/bench/btl/generic_bench/timers/portable_timer.hh old mode 100755 new mode 100644 diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h old mode 100755 new mode 100644 -- cgit v1.2.3 From 234a1094b7839017e6cf8e8f376995ee13775e00 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 09:18:44 +0100 Subject: Add static assertion to y(), z(), w() accessors --- Eigen/src/Core/DenseCoeffsBase.h | 36 ++++++++++++++++++++++++++++++------ Eigen/src/Core/util/StaticAssert.h | 1 + 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 28cc1432c..423ab167d 100644 --- 
a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h @@ -191,19 +191,31 @@ class DenseCoeffsBase : public EigenBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - y() const { return (*this)[1]; } + y() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } /** equivalent to operator[](2). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - z() const { return (*this)[2]; } + z() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } /** equivalent to operator[](3). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - w() const { return (*this)[3]; } + w() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } /** \internal * \returns the packet of coefficients starting at the given row and column. It is your responsibility @@ -424,19 +436,31 @@ class DenseCoeffsBase : public DenseCoeffsBase=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } /** equivalent to operator[](2). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - z() { return (*this)[2]; } + z() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } /** equivalent to operator[](3). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - w() { return (*this)[3]; } + w() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } }; /** \brief Base class providing direct read-only coefficient access to matrices and arrays. diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 1fe365aa7..e174509e0 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -50,6 +50,7 @@ THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, + OUT_OF_RANGE_ACCESS, YOU_MADE_A_PROGRAMMING_MISTAKE, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, -- cgit v1.2.3 From 0b7169d1f70b0c18ae14d2a2bb5f165b7b675d51 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:15:59 +0100 Subject: bug #1147: fix compilation of PastixSupport --- Eigen/PaStiXSupport | 1 - 1 file changed, 1 deletion(-) diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport index 3411dface..de3a63b4d 100644 --- a/Eigen/PaStiXSupport +++ b/Eigen/PaStiXSupport @@ -12,7 +12,6 @@ #include "src/Core/util/DisableStupidWarnings.h" -#include extern "C" { #include #include -- cgit v1.2.3 From db237d0c75abe70cf8ed0f21e7eaa66474f8a9bd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:49:01 +0100 Subject: bug #1145: fix PastixSupport LLT/LDLT wrappers (missing resize prior to calls to selfAdjointView) --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index c8cb2c0cc..d21495e81 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -581,6 +581,7 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> > void grabMatrix(const MatrixType& matrix, ColSpMatrix& out) { + 
out.resize(matrix.rows(), matrix.cols()); // Pastix supports only lower, column-major matrices out.template selfadjointView() = matrix.template selfadjointView(); internal::c_to_fortran_numbering(out); @@ -666,6 +667,7 @@ class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> > void grabMatrix(const MatrixType& matrix, ColSpMatrix& out) { // Pastix supports only lower, column-major matrices + out.resize(matrix.rows(), matrix.cols()); out.template selfadjointView() = matrix.template selfadjointView(); internal::c_to_fortran_numbering(out); } -- cgit v1.2.3 From 4c5e96aab6baf32b077d8824b4ecbe592caaa8a0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:56:17 +0100 Subject: bug #1148: silence Pastix by default --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index d21495e81..7c8622c97 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -268,7 +268,7 @@ void PastixBase::init() 0, 0, 0, 1, m_iparm.data(), m_dparm.data()); m_iparm[IPARM_MATRIX_VERIFICATION] = API_NO; - m_iparm[IPARM_VERBOSE] = 2; + m_iparm[IPARM_VERBOSE] = API_VERBOSE_NOT; m_iparm[IPARM_ORDERING] = API_ORDER_SCOTCH; m_iparm[IPARM_INCOMPLETE] = API_NO; m_iparm[IPARM_OOC_LIMIT] = 2000; -- cgit v1.2.3 From ed8ade9c65bc25f2226946e4275d62f7ceb28213 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 19:01:24 +0100 Subject: bug #1149: fix Pastix*::*parm() --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 6 +++--- test/pastix_support.cpp | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 7c8622c97..d2ebfd7bb 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -184,7 +184,7 @@ class PastixBase : public SparseSolverBase * The statistics related to the different phases of factorization and solve are saved here as well * \sa analyzePattern() factorize() */ - Array& dparm() + Array& dparm() { return m_dparm; } @@ -244,8 +244,8 @@ class PastixBase : public SparseSolverBase mutable ComputationInfo m_info; mutable pastix_data_t *m_pastixdata; // Data structure for pastix mutable int m_comm; // The MPI communicator identifier - mutable Matrix m_iparm; // integer vector for the input parameters - mutable Matrix m_dparm; // Scalar vector for the input parameters + mutable Array m_iparm; // integer vector for the input parameters + mutable Array m_dparm; // Scalar vector for the input parameters mutable Matrix m_perm; // Permutation vector mutable Matrix m_invp; // Inverse permutation vector mutable int m_size; // Size of the matrix diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp index 49239e3a5..b62f85739 100644 --- a/test/pastix_support.cpp +++ b/test/pastix_support.cpp @@ -27,6 +27,14 @@ template void test_pastix_T() check_sparse_spd_solving(pastix_llt_upper); check_sparse_spd_solving(pastix_ldlt_upper); check_sparse_square_solving(pastix_lu); + + // Some compilation check: + pastix_llt_lower.iparm(); + pastix_llt_lower.dparm(); + pastix_ldlt_lower.iparm(); + pastix_ldlt_lower.dparm(); + pastix_lu.iparm(); + pastix_lu.dparm(); } // There is no support for selfadjoint matrices with PaStiX.
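// A hypothetical follow-up once these accessors compile (sketch only; IPARM_VERBOSE
// and API_VERBOSE_NOT are the PaStiX macros used above, DPARM_EPSILON_REFINEMENT is
// assumed from the PaStiX API):
// pastix_lu.iparm()[IPARM_VERBOSE] = API_VERBOSE_NOT;
// pastix_lu.dparm()[DPARM_EPSILON_REFINEMENT] = 1e-10;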
-- cgit v1.2.3 From 47076bf00ec32c41c0a7f2ba438361ea5f0256e4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Jan 2016 14:51:48 -0800 Subject: Reduce the register pressure exerted by the tensor mappers whenever possible. This improves the performance of the contraction of a matrix with a vector by about 35%. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 5 +- .../CXX11/src/Tensor/TensorContractionMapper.h | 109 ++++++++++++++++++--- 2 files changed, 101 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 506696ae9..575ae7b54 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -128,6 +128,7 @@ struct TensorContractionEvaluatorBase PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = true }; // Most of the code is assuming that both input tensors are ColMajor. If the @@ -434,11 +435,11 @@ struct TensorContractionEvaluatorBase } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_result + index); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } protected: // Prevent assignment diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index b25b34d61..9b6d18090 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -22,6 +22,54 @@ enum { /* * Implementation of the Eigen blas_data_mapper class for tensors. 
*/ + +template struct CoeffLoader { + enum { + DirectOffsets = false + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return m_tensor.template packet(index); + } + + + private: + const Tensor m_tensor; +}; + +template struct CoeffLoader { + enum { + DirectOffsets = true + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_data += offset; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return internal::ploadt_ro(m_data + index); + } + private: + typedef typename Tensor::Scalar Scalar; + const Scalar* m_data; +}; + template::DirectOffsets + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } @@ -148,7 +204,7 @@ class SimpleTensorContractionMapper { } protected: - const Tensor m_tensor; + CoeffLoader m_tensor; const nocontract_t m_nocontract_strides; const nocontract_t m_ij_strides; const contract_t m_contract_strides; const contract_t m_k_strides; @@ -270,12 +326,6 @@ class BaseTensorContractionMapper -class TensorContractionInputMapper; template::type Packet; typedef typename packet_traits::half HalfPacket; - typedef TensorContractionInputMapper ParentMapper; + typedef BaseTensorContractionMapper ParentMapper; typedef TensorContractionSubMapper Self; typedef Self LinearMapper; + enum { + // We can use direct offsets iff the parent mapper supports them and we can compute the strides. + // TODO: we should also enable direct offsets for the Rhs case. + UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets + }; + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { + // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute + // this offset every time we attempt to access a coefficient.
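+ // (With UseDirectOffsets the data pointer is advanced once, right here, so the
+ // accessors below can address coefficient (i, j) directly instead of re-adding
+ // m_vert_offset/m_horiz_offset on every call; column-major layout, hence
+ // offset = vert_offset + horiz_offset * stride.)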
+ if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } return m_base_mapper(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadHalfPacket(i, 0); + } return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + return; + } m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); } @@ -324,6 +408,9 @@ class TensorContractionSubMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ?
Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } @@ -333,7 +420,7 @@ class TensorContractionSubMapper { } private: - const ParentMapper& m_base_mapper; + ParentMapper m_base_mapper; const Index m_vert_offset; const Index m_horiz_offset; }; -- cgit v1.2.3 From 62f7e777117d35ee4d45e77af429246b1e4e1c12 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 00:02:59 +0100 Subject: add upper|lower case in incomplete_cholesky unit test --- test/incomplete_cholesky.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 435e2839a..7acad9872 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -15,16 +15,18 @@ template void test_incomplete_cholesky_T() { typedef SparseMatrix SparseMatrixType; - ConjugateGradient > > cg_illt_lower_amd; - ConjugateGradient > > cg_illt_lower_nat; - ConjugateGradient > > cg_illt_upper_amd; - ConjugateGradient > > cg_illt_upper_nat; + ConjugateGradient > > cg_illt_lower_amd; + ConjugateGradient > > cg_illt_lower_nat; + ConjugateGradient > > cg_illt_upper_amd; + ConjugateGradient > > cg_illt_upper_nat; + ConjugateGradient > > cg_illt_uplo_amd; CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_nat) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_amd) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_nat) ); + CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) ); } void test_incomplete_cholesky() -- cgit v1.2.3 From 7ce932edd33c56baab0a5c5a8e0608c6345efd53 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Jan 2016 18:12:08 -0800 Subject: Small cleanup and small fix to the contraction of row major tensors --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 575ae7b54..624e814e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -147,8 +147,6 @@ struct TensorContractionEvaluatorBase static const int ContractDims = internal::array_size::value; static const int NumDims = max_n_1::size; - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; typedef array contract_t; typedef array::size> left_nocontract_t; typedef array::size> right_nocontract_t; @@ -195,8 +193,8 @@ struct TensorContractionEvaluatorBase // We need to flip all the pairs of contracting indices as well as // reversing the dimensions. 
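 // (Illustrative: with ContractDims == 2, the pair written at i == 0 is now taken
 // from op.indices()[1] rather than op.indices()[0], and each dimension index d
 // is remapped to Dims - 1 - d on the corresponding side.)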
for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; - eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; } } @@ -504,9 +502,6 @@ struct TensorEvaluator::Dimensions>::value; static const int ContractDims = internal::array_size::value; - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - typedef array contract_t; typedef array::size> left_nocontract_t; typedef array::size> right_nocontract_t; -- cgit v1.2.3 From 690bc950f70c61075d396671e63480bbd64bb297 Mon Sep 17 00:00:00 2001 From: Jan Prach Date: Wed, 20 Jan 2016 19:35:59 -0800 Subject: fix clang warnings "braces around scalar initializer" --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 12 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 8 ++++---- unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h | 4 ++-- unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h | 2 +- unsupported/test/cxx11_tensor_broadcasting.cpp | 6 +++--- unsupported/test/cxx11_tensor_contraction.cpp | 2 +- unsupported/test/cxx11_tensor_map.cpp | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 4d99f786c..c1c57041f 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -404,7 +404,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { template constexpr inline array h_array_zip(array a, array b, numeric_list) { - return array{{ Op::run(array_get(a), array_get(b))... }}; + return array{ Op::run(array_get(a), array_get(b))... }; } template @@ -432,7 +432,7 @@ constexpr inline auto array_zip_and_reduce(array a, array b) -> decl template constexpr inline array h_array_apply(array a, numeric_list) { - return array{{ Op::run(array_get(a))... }}; + return array{ Op::run(array_get(a))... }; } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index dc6ca4909..092e30c1f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -119,7 +119,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return coeff(array{firstIndex, secondIndex, otherIndices...}); } #endif @@ -159,7 +159,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return coeffRef(array{firstIndex, secondIndex, otherIndices...}); } #endif @@ -199,7 +199,7 @@ class Tensor : public TensorBaseoperator()(array{{firstIndex, secondIndex, otherIndices...}}); + return this->operator()(array{firstIndex, secondIndex, otherIndices...}); } #else EIGEN_DEVICE_FUNC @@ -266,7 +266,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return operator()(array{firstIndex, secondIndex, otherIndices...}); } #else EIGEN_DEVICE_FUNC @@ -342,7 +342,7 @@ class Tensor : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... 
otherDimensions) - : m_storage(internal::array_prod(array{{firstDimension, otherDimensions...}}), array{{firstDimension, otherDimensions...}}) + : m_storage(internal::array_prod(array{firstDimension, otherDimensions...}), array{firstDimension, otherDimensions...}) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -427,7 +427,7 @@ class Tensor : public TensorBase{{firstDimension, otherDimensions...}}); + resize(array{firstDimension, otherDimensions...}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 52569a359..06cac3570 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -289,7 +289,7 @@ struct DSizes : array { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) { EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) - (*this) = array{{firstDimension, otherDimensions...}}; + (*this) = array{firstDimension, otherDimensions...}; } #else EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 70282dd83..7d0858d02 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -73,7 +73,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return coeff(array{firstIndex, otherIndices...}); } #endif @@ -105,7 +105,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return coeffRef(array{firstIndex, otherIndices...}); } #endif @@ -137,7 +137,7 @@ class TensorFixedSize : public TensorBaseoperator()(array{{firstIndex, otherIndices...}}); + return this->operator()(array{firstIndex, otherIndices...}); } #endif @@ -176,7 +176,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return operator()(array{firstIndex, otherIndices...}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 6f69da34a..7233f4c89 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -141,10 +141,10 @@ template class TensorMap : public Tensor { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{firstIndex, otherIndices...}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{firstIndex, otherIndices...}); return m_data[index]; } } @@ -228,10 +228,10 @@ template class TensorMap : public Tensor static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, 
otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{firstIndex, otherIndices...}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{firstIndex, otherIndices...}); return m_data[index]; } } diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index bc4f2025f..1b9fe2779 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -55,7 +55,7 @@ class DynamicSGroup inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + return operator()(tensor, std::array{firstIndex, otherIndices...}); } template @@ -90,7 +90,7 @@ class DynamicSGroup template inline std::array h_permute(std::size_t which, const std::array& idx, internal::numeric_list) const { - return std::array{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }}; + return std::array{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }; } template diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index 942293bd7..255c344b4 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -217,7 +217,7 @@ class StaticSGroup inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + return operator()(tensor, std::array{firstIndex, otherIndices...}); } template diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 2ddf47234..6fdefd66c 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -167,13 +167,13 @@ static void test_fixed_size_broadcasting() TensorFixedSize, DataLayout> t2; t2 = t2.constant(20.0f); - Tensor t3 = t1 + t2.broadcast(Eigen::array{{10}}); + Tensor t3 = t1 + t2.broadcast(Eigen::array{10}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); } - TensorMap, DataLayout> > t4(t2.data(), {{1}}); - Tensor t5 = t1 + t4.broadcast(Eigen::array{{10}}); + TensorMap, DataLayout> > t4(t2.data(), {1}); + Tensor t5 = t1 + t4.broadcast(Eigen::array{10}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index b0d52c6cf..c5f3af73e 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -456,7 +456,7 @@ static void test_tensor_product() mat1.setRandom(); mat2.setRandom(); - Tensor result = mat1.contract(mat2, Eigen::array{{}}); + Tensor result = mat1.contract(mat2, Eigen::array{}); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 3); diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index a8a095e38..dc0f8a5a2 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -130,7 +130,7 @@ static void test_3d() } TensorMap> mat3(mat1.data(), 2, 3, 7); - TensorMap> mat4(mat2.data(), array{{2, 3, 7}}); + TensorMap> mat4(mat2.data(), array{2, 3, 7}); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); -- cgit v1.2.3 From 34340458cbe33976559bf8fd73a9d4b2f747d611 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 14:29:45 +0100 Subject: bug #1151: remove useless critical section --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index d830dfb96..d77fc2630 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -145,12 +145,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 - #pragma omp critical - { for(Index i=0; i Date: Thu, 21 Jan 2016 20:18:51 +0100 Subject: Add numext::sqrt function to enable custom optimized implementation. This changeset adds two specializations for float/double on SSE. Those are mostly useful with GCC for which std::sqrt adds an extra and costly check on the result of _mm_sqrt_*. Clang does not add this burden. In this changeset, only DenseBase::norm() makes use of it.
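The underlying pattern, reduced to a standalone sketch (hypothetical function names, assuming SSE is available; this is not the Eigen source itself): a generic template defers to std::sqrt, and a full specialization for float routes through the raw intrinsic so no extra check is emitted around _mm_sqrt_ss:

    #include <cmath>
    #include <xmmintrin.h>

    template <typename T>
    T generic_sqrt(const T& x) {   // generic fallback, mirrors the numext:: idea
      using std::sqrt;
      return sqrt(x);
    }

    template <>
    float generic_sqrt<float>(const float& x) {  // SSE fast path
      return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(x)));
    }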
--- Eigen/src/Core/Dot.h | 3 +-- Eigen/src/Core/MathFunctions.h | 20 ++++++++++++++++++-- Eigen/src/Core/arch/SSE/MathFunctions.h | 22 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index c5040c67b..ce42854cd 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -99,8 +99,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala template inline typename NumTraits::Scalar>::Real MatrixBase::norm() const { - EIGEN_USING_STD_MATH(sqrt) - return sqrt(squaredNorm()); + return numext::sqrt(squaredNorm()); } /** \returns an expression of the quotient of *this by its own norm. diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4d5e1acb8..1c7b28a4b 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -954,8 +954,8 @@ T (ceil)(const T& x) return ceil(x); } -// Log base 2 for 32 bits positive integers. -// Conveniently returns 0 for x==0. +/** Log base 2 for 32 bits positive integers. + * Conveniently returns 0 for x==0. */ inline int log2(int x) { eigen_assert(x>=0); @@ -969,6 +969,22 @@ inline int log2(int x) return table[(v * 0x07C4ACDDU) >> 27]; } +/** \returns the square root of \a x. + * + * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode, + * but slightly faster for float/double and some compilers (e.g., gcc), thanks to + * specializations when SSE is enabled. + * + * Its usage is justified in performance critical functions, like norm/normalize. + */ +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sqrt(const T &x) +{ + EIGEN_USING_STD_MATH(sqrt); + return sqrt(x); +} + } // end namespace numext namespace internal { diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 3b8b7303f..0dd52f96e 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -518,6 +518,28 @@ Packet2d prsqrt(const Packet2d& x) { } // end namespace internal +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sqrt(const float &x) +{ + return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x))); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sqrt(const double &x) +{ +#if EIGEN_COMP_GNUC + return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x))); +#else + return internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x))); +#endif +} + +} // end namespace numext + } // end namespace Eigen #endif // EIGEN_MATH_FUNCTIONS_SSE_H -- cgit v1.2.3 From 7cae8918c019feabf6c143c430d0cd82c74aeec3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 20:30:32 +0100 Subject: Fix compilation on old gcc+AVX --- Eigen/src/Core/arch/SSE/MathFunctions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 0dd52f96e..74f6abc37 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -524,7 +524,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sqrt(const float &x) { - return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x))); + return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x)))); } template<> @@ -532,9 +532,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double &x) { #if EIGEN_COMP_GNUC - return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x))); + return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else - return
internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x))); + return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); #endif } -- cgit v1.2.3 From ee37eb4eed09fe35be2acc3699e80f49a44ea99a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 20:43:42 +0100 Subject: bug #977: avoid division by 0 in normalize() and normalized(). --- Eigen/src/Core/Dot.h | 19 ++++++++++++++++--- test/adjoint.cpp | 9 +++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index ce42854cd..221fc3224 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -102,7 +102,10 @@ inline typename NumTraits::Scalar>::Real Matr return numext::sqrt(squaredNorm()); } -/** \returns an expression of the quotient of *this by its own norm. +/** \returns an expression of the quotient of \c *this by its own norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), + * then this function returns a copy of the input. * * \only_for_vectors * @@ -114,19 +117,29 @@ MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; _Nested n(derived()); - return n / n.norm(); + RealScalar z = n.squaredNorm(); + // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU + if(z>RealScalar(0)) + return n / numext::sqrt(z); + else + return n; } /** Normalizes the vector, i.e. divides it by its own norm. * * \only_for_vectors * + * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged. + * * \sa norm(), normalized() */ template inline void MatrixBase::normalize() { - *this /= norm(); + RealScalar z = squaredNorm(); + // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU + if(z>RealScalar(0)) + derived() /= numext::sqrt(z); } //---------- implementation of other norms ---------- diff --git a/test/adjoint.cpp b/test/adjoint.cpp index 3b2a53c91..b1e69c2e5 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -42,6 +42,15 @@ template<> struct adjoint_specific { VERIFY_IS_APPROX(v1, v1.norm() * v3); VERIFY_IS_APPROX(v3, v1.normalized()); VERIFY_IS_APPROX(v3.norm(), RealScalar(1)); + + // check null inputs + VERIFY_IS_APPROX((v1*0).normalized(), (v1*0)); + RealScalar very_small = (std::numeric_limits::min)(); + VERIFY( (v1*very_small).norm() == 0 ); + VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small)); + v3 = v1*very_small; + v3.normalize(); + VERIFY_IS_APPROX(v3, (v1*very_small)); // check compatibility of dot and adjoint ref = NumTraits::IsInteger ? 
0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm())); -- cgit v1.2.3 From c33479324c2a24094acf78c776655a6474c3bcca Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Jan 2016 17:08:11 -0800 Subject: Fixed a constness bug --- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 0524cf0d8..11284315c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -146,7 +146,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; } -- cgit v1.2.3 From 06971223efda38924832d86650ee1d4af6984344 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 22 Jan 2016 15:02:21 +0100 Subject: Unify std::numeric_limits and device::numeric_limits within numext namespace --- Eigen/src/Core/NumTraits.h | 18 +++--------------- Eigen/src/Core/util/Meta.h | 6 ++++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index d1aabd995..2ea5eb272 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -71,11 +71,7 @@ template struct GenericNumTraits EIGEN_DEVICE_FUNC static inline Real epsilon() { - #if defined(__CUDA_ARCH__) - return internal::device::numeric_limits::epsilon(); - #else - return std::numeric_limits::epsilon(); - #endif + return numext::numeric_limits::epsilon(); } EIGEN_DEVICE_FUNC static inline Real dummy_precision() @@ -87,20 +83,12 @@ template struct GenericNumTraits EIGEN_DEVICE_FUNC static inline T highest() { -#if defined(__CUDA_ARCH__) - return (internal::device::numeric_limits::max)(); -#else - return (std::numeric_limits::max)(); -#endif + return (numext::numeric_limits::max)(); } EIGEN_DEVICE_FUNC static inline T lowest() { -#if defined(__CUDA_ARCH__) - return IsInteger ? (internal::device::numeric_limits::min)() : (-(internal::device::numeric_limits::max)()); -#else - return IsInteger ? (std::numeric_limits::min)() : (-(std::numeric_limits::max)()); -#endif + return IsInteger ? (numext::numeric_limits::min)() : (-(numext::numeric_limits::max)()); } }; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 3dee2bd7c..617ba0a65 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -375,6 +375,12 @@ template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif +#if defined(__CUDA_ARCH__) +using internal::device::numeric_limits; +#else +using std::numeric_limits; +#endif + // Integer division with rounding up. 
// T is assumed to be an integer type with a>=0, and b>0 template -- cgit v1.2.3 From 6a44ccb58b81771cc8438af20e5bf44de3d8c932 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 22 Jan 2016 15:03:53 +0100 Subject: Backout changeset 690bc950f70c61075d396671e63480bbd64bb297 --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 12 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 8 ++++---- unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h | 4 ++-- unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h | 2 +- unsupported/test/cxx11_tensor_broadcasting.cpp | 6 +++--- unsupported/test/cxx11_tensor_contraction.cpp | 2 +- unsupported/test/cxx11_tensor_map.cpp | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index c1c57041f..4d99f786c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -404,7 +404,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { template constexpr inline array h_array_zip(array a, array b, numeric_list) { - return array{ Op::run(array_get(a), array_get(b))... }; + return array{{ Op::run(array_get(a), array_get(b))... }}; } template @@ -432,7 +432,7 @@ constexpr inline auto array_zip_and_reduce(array a, array b) -> decl template constexpr inline array h_array_apply(array a, numeric_list) { - return array{ Op::run(array_get(a))... }; + return array{{ Op::run(array_get(a))... }}; } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 092e30c1f..dc6ca4909 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -119,7 +119,7 @@ class Tensor : public TensorBase{firstIndex, secondIndex, otherIndices...}); + return coeff(array{{firstIndex, secondIndex, otherIndices...}}); } #endif @@ -159,7 +159,7 @@ class Tensor : public TensorBase{firstIndex, secondIndex, otherIndices...}); + return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); } #endif @@ -199,7 +199,7 @@ class Tensor : public TensorBaseoperator()(array{firstIndex, secondIndex, otherIndices...}); + return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); } #else EIGEN_DEVICE_FUNC @@ -266,7 +266,7 @@ class Tensor : public TensorBase{firstIndex, secondIndex, otherIndices...}); + return operator()(array{{firstIndex, secondIndex, otherIndices...}}); } #else EIGEN_DEVICE_FUNC @@ -342,7 +342,7 @@ class Tensor : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) - : m_storage(internal::array_prod(array{firstDimension, otherDimensions...}), array{firstDimension, otherDimensions...}) + : m_storage(internal::array_prod(array{{firstDimension, otherDimensions...}}), array{{firstDimension, otherDimensions...}}) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -427,7 +427,7 @@ class Tensor : public TensorBase{firstDimension, otherDimensions...}); + resize(array{{firstDimension, otherDimensions...}}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 06cac3570..52569a359 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -289,7 +289,7 @@ struct DSizes : array { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) { EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) - (*this) = array{firstDimension, otherDimensions...}; + (*this) = array{{firstDimension, otherDimensions...}}; } #else EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 7d0858d02..70282dd83 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -73,7 +73,7 @@ class TensorFixedSize : public TensorBase{firstIndex, otherIndices...}); + return coeff(array{{firstIndex, otherIndices...}}); } #endif @@ -105,7 +105,7 @@ class TensorFixedSize : public TensorBase{firstIndex, otherIndices...}); + return coeffRef(array{{firstIndex, otherIndices...}}); } #endif @@ -137,7 +137,7 @@ class TensorFixedSize : public TensorBaseoperator()(array{firstIndex, otherIndices...}); + return this->operator()(array{{firstIndex, otherIndices...}}); } #endif @@ -176,7 +176,7 @@ class TensorFixedSize : public TensorBase{firstIndex, otherIndices...}); + return operator()(array{{firstIndex, otherIndices...}}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 7233f4c89..6f69da34a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -141,10 +141,10 @@ template class TensorMap : public Tensor { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{firstIndex, otherIndices...}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{firstIndex, otherIndices...}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } @@ -228,10 +228,10 @@ template class TensorMap : public Tensor static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{firstIndex, otherIndices...}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{firstIndex, otherIndices...}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } diff --git 
a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index 1b9fe2779..bc4f2025f 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -55,7 +55,7 @@ class DynamicSGroup inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array{firstIndex, otherIndices...}); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); } template @@ -90,7 +90,7 @@ class DynamicSGroup template inline std::array h_permute(std::size_t which, const std::array& idx, internal::numeric_list) const { - return std::array{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }; + return std::array{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }}; } template diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index 255c344b4..942293bd7 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -217,7 +217,7 @@ class StaticSGroup inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array{firstIndex, otherIndices...}); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); } template diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 6fdefd66c..2ddf47234 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -167,13 +167,13 @@ static void test_fixed_size_broadcasting() TensorFixedSize, DataLayout> t2; t2 = t2.constant(20.0f); - Tensor t3 = t1 + t2.broadcast(Eigen::array{10}); + Tensor t3 = t1 + t2.broadcast(Eigen::array{{10}}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); } - TensorMap, DataLayout> > t4(t2.data(), {1}); - Tensor t5 = t1 + t4.broadcast(Eigen::array{10}); + TensorMap, DataLayout> > t4(t2.data(), {{1}}); + Tensor t5 = t1 + t4.broadcast(Eigen::array{{10}}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index c5f3af73e..b0d52c6cf 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -456,7 +456,7 @@ static void test_tensor_product() mat1.setRandom(); mat2.setRandom(); - Tensor result = mat1.contract(mat2, Eigen::array{}); + Tensor result = mat1.contract(mat2, Eigen::array{{}}); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 3); diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index dc0f8a5a2..a8a095e38 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -130,7 +130,7 @@ static void test_3d() } 
TensorMap> mat3(mat1.data(), 2, 3, 7); - TensorMap> mat4(mat2.data(), array{2, 3, 7}); + TensorMap> mat4(mat2.data(), array{{2, 3, 7}}); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); -- cgit v1.2.3 From 5358c3858963e03581640e58ea1f3adbdd03b831 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 22 Jan 2016 16:05:29 +0100 Subject: bug #1095: add Cholmod*::logDeterminant/determinant (from patch of Joshua Pritikin) --- Eigen/src/CholmodSupport/CholmodSupport.h | 53 ++++++++++++++++++++++++++++++- test/cholmod_support.cpp | 14 ++++---- 2 files changed, 59 insertions(+), 8 deletions(-) diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index 2da962471..c7c521b95 100644 --- a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h @@ -78,7 +78,7 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat) { res.itype = CHOLMOD_INT; } - else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value) + else if (internal::is_same<_StorageIndex,long>::value) { res.itype = CHOLMOD_LONG; } @@ -327,6 +327,57 @@ class CholmodBase : public SparseSolverBase return derived(); } + /** \returns the determinant of the underlying matrix from the current factorization */ + Scalar determinant() const + { + using std::exp; + return exp(logDeterminant()); + } + + /** \returns the log determinant of the underlying matrix from the current factorization */ + Scalar logDeterminant() const + { + using std::log; + using numext::real; + eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()"); + + RealScalar logDet = 0; + Scalar *x = static_cast(m_cholmodFactor->x); + if (m_cholmodFactor->is_super) + { + // Supernodal factorization stored as a packed list of dense column-major blocs, + // as described by the following structure: + + // super[k] == index of the first column of the j-th super node + StorageIndex *super = static_cast(m_cholmodFactor->super); + // pi[k] == offset to the description of row indices + StorageIndex *pi = static_cast(m_cholmodFactor->pi); + // px[k] == offset to the respective dense block + StorageIndex *px = static_cast(m_cholmodFactor->px); + + Index nb_super_nodes = m_cholmodFactor->nsuper; + for (Index k=0; k < nb_super_nodes; ++k) + { + StorageIndex ncols = super[k + 1] - super[k]; + StorageIndex nrows = pi[k + 1] - pi[k]; + + Map, 0, InnerStride<> > sk(x + px[k], ncols, InnerStride<>(nrows+1)); + logDet += sk.real().log().sum(); + } + } + else + { + // Simplicial factorization stored as standard CSC matrix. 
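+ // In this simplicial layout the diagonal entry of column j of L is the
+ // first value stored in that column, i.e. x[p[j]], so summing the logs of
+ // these entries over all columns gives log(det(L)) (or log(det(D)) for an
+ // LDL^T factorization, whose L has a unit diagonal); for an LL^T
+ // factorization the result is doubled afterwards to obtain log(det(A)).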
+ StorageIndex *p = static_cast(m_cholmodFactor->p); + Index size = m_cholmodFactor->n; + for (Index k=0; kis_ll) + logDet *= 2.0; + return logDet; + }; + template void dumpMemory(Stream& /*s*/) {} diff --git a/test/cholmod_support.cpp b/test/cholmod_support.cpp index 87f119b1e..a7eda28f7 100644 --- a/test/cholmod_support.cpp +++ b/test/cholmod_support.cpp @@ -41,13 +41,13 @@ template void test_cholmod_T() check_sparse_spd_solving(llt_colmajor_upper); check_sparse_spd_solving(ldlt_colmajor_lower); check_sparse_spd_solving(ldlt_colmajor_upper); - -// check_sparse_spd_determinant(chol_colmajor_lower); -// check_sparse_spd_determinant(chol_colmajor_upper); -// check_sparse_spd_determinant(llt_colmajor_lower); -// check_sparse_spd_determinant(llt_colmajor_upper); -// check_sparse_spd_determinant(ldlt_colmajor_lower); -// check_sparse_spd_determinant(ldlt_colmajor_upper); + + check_sparse_spd_determinant(chol_colmajor_lower); + check_sparse_spd_determinant(chol_colmajor_upper); + check_sparse_spd_determinant(llt_colmajor_lower); + check_sparse_spd_determinant(llt_colmajor_upper); + check_sparse_spd_determinant(ldlt_colmajor_lower); + check_sparse_spd_determinant(ldlt_colmajor_upper); } void test_cholmod_support() -- cgit v1.2.3 From 4beb447e27baaa19081e835bd6aba76e9b02cc67 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 22 Jan 2016 14:37:26 -0800 Subject: Created a mechanism to enable contraction mappers to determine the best blocking strategy. --- unsupported/Eigen/CXX11/Tensor | 1 + .../CXX11/src/Tensor/TensorContractionBlocking.h | 58 ++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 1c5734383..b4f860c41 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -89,6 +89,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" +#include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" #include "src/Tensor/TensorContractionCuda.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h new file mode 100644 index 000000000..78ed5038f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -0,0 +1,58 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
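+//
+// This header defines a small helper that picks cache-friendly block sizes
+// for the tensor contraction kernels: kc along the contracted dimension and
+// mc/nc along the two result dimensions. A minimal usage sketch, assuming
+// the LhsMapper/RhsMapper input-mapper types from TensorContractionMapper.h
+// and the template-parameter order (mappers, index type, sharding tag):
+//
+//   internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+//                                       internal::ShardByCol>
+//       blocking(k, m, n, num_threads);
+//   const Index kc = blocking.kc();  // panel depth along K
+//   const Index mc = blocking.mc();  // block height along M
+//   const Index nc = blocking.nc();  // block width along N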
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H + + +namespace Eigen { +namespace internal { + +enum { + ShardByRow = 0, + ShardByCol = 1 +}; + + +// Default Blocking Strategy +template +class TensorContractionBlocking { + public: + + typedef typename LhsMapper::Scalar LhsScalar; + typedef typename RhsMapper::Scalar RhsScalar; + + TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + kc_(k), mc_(m), nc_(n) + { + if (ShardingType == ShardByCol) { + computeProductBlockingSizes(kc_, mc_, nc_, num_threads); + } + else { + if (kc_ && mc_ && nc_) { + mc_ = (((m / num_threads) + 15) / 16) * 16; + } + } + } + + EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + + private: + Index kc_; + Index mc_; + Index nc_; +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H -- cgit v1.2.3 From 3aeeca32af00b1921b4424d7be2e03bbaeaa05b4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 22 Jan 2016 16:36:30 -0800 Subject: Leverage the new blocking code in the tensor contraction code. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +--- unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 5 +++-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 8 ++++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 624e814e2..e6a008ba7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -582,10 +582,8 @@ struct TensorEvaluator BlockingType; - // Sizes of the blocks to load in cache. See the Goto paper for details. 
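// The kc/mc/nc triple follows the classic Goto-style GEMM blocking scheme:
// the block sizes are chosen so that the packed lhs/rhs panels fit the cache
// hierarchy, and the helper class introduced above now centralizes that
// choice for the tensor contraction code paths.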
- BlockingType blocking(m, n, k, 1, true); + internal::TensorContractionBlocking blocking(k, m, n, 1); const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 9b6d18090..63c8ae126 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -426,15 +426,16 @@ class TensorContractionSubMapper { }; -template class TensorContractionInputMapper - : public BaseTensorContractionMapper { + : public BaseTensorContractionMapper { public: + typedef Scalar_ Scalar; typedef BaseTensorContractionMapper Base; typedef TensorContractionSubMapper SubMapper; typedef SubMapper VectorMapper; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 576bea295..51a3b9490 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -176,10 +176,10 @@ struct TensorEvaluatorm_device.numThreads(); - Index mc = m; - Index nc = n; - Index kc = k; - internal::computeProductBlockingSizes(kc, mc, nc, num_threads); + internal::TensorContractionBlocking blocking(k, m, n, num_threads); + Index mc = blocking.mc(); + Index nc = blocking.nc(); + Index kc = blocking.kc(); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); -- cgit v1.2.3 From 9f94e030c1c0f334de812cd5220dbb95a0a1e145 Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Fri, 22 Jan 2016 20:08:45 -0700 Subject: Re-add executable flags to minimize changeset. 
--- Eigen/src/Core/AssignEvaluator.h | 0 Eigen/src/Core/Assign_MKL.h | 0 Eigen/src/Core/ProductEvaluators.h | 0 Eigen/src/Core/VectorwiseOp.h | 0 Eigen/src/Core/arch/AltiVec/PacketMath.h | 0 Eigen/src/Core/arch/SSE/PacketMath.h | 0 Eigen/src/Core/products/GeneralMatrixVector_MKL.h | 0 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h | 0 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h | 0 Eigen/src/Core/util/BlasUtil.h | 0 Eigen/src/Core/util/DisableStupidWarnings.h | 0 Eigen/src/Eigenvalues/ComplexSchur_MKL.h | 0 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 0 Eigen/src/Eigenvalues/RealQZ.h | 0 Eigen/src/Eigenvalues/RealSchur_MKL.h | 0 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h | 0 Eigen/src/PardisoSupport/PardisoSupport.h | 0 Eigen/src/QR/ColPivHouseholderQR_MKL.h | 0 Eigen/src/SVD/JacobiSVD.h | 0 Eigen/src/SparseLU/SparseLU.h | 0 bench/btl/generic_bench/timers/portable_timer.hh | 0 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 0 22 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 Eigen/src/Core/AssignEvaluator.h mode change 100644 => 100755 Eigen/src/Core/Assign_MKL.h mode change 100644 => 100755 Eigen/src/Core/ProductEvaluators.h mode change 100644 => 100755 Eigen/src/Core/VectorwiseOp.h mode change 100644 => 100755 Eigen/src/Core/arch/AltiVec/PacketMath.h mode change 100644 => 100755 Eigen/src/Core/arch/SSE/PacketMath.h mode change 100644 => 100755 Eigen/src/Core/products/GeneralMatrixVector_MKL.h mode change 100644 => 100755 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h mode change 100644 => 100755 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h mode change 100644 => 100755 Eigen/src/Core/util/BlasUtil.h mode change 100644 => 100755 Eigen/src/Core/util/DisableStupidWarnings.h mode change 100644 => 100755 Eigen/src/Eigenvalues/ComplexSchur_MKL.h mode change 100644 => 100755 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h mode change 100644 => 100755 Eigen/src/Eigenvalues/RealQZ.h mode change 100644 => 100755 Eigen/src/Eigenvalues/RealSchur_MKL.h mode change 100644 => 100755 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h mode change 100644 => 100755 Eigen/src/PardisoSupport/PardisoSupport.h mode change 100644 => 100755 Eigen/src/QR/ColPivHouseholderQR_MKL.h mode change 100644 => 100755 Eigen/src/SVD/JacobiSVD.h mode change 100644 => 100755 Eigen/src/SparseLU/SparseLU.h mode change 100644 => 100755 bench/btl/generic_bench/timers/portable_timer.hh mode change 100644 => 100755 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h old mode 100644 new mode 100755 diff --git 
a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h old mode 100644 new mode 100755 diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h old mode 100644 new mode 100755 diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h old mode 100644 new mode 100755 diff --git a/bench/btl/generic_bench/timers/portable_timer.hh b/bench/btl/generic_bench/timers/portable_timer.hh old mode 100644 new mode 100755 diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h old mode 100644 new mode 100755 -- cgit v1.2.3 From 0caa4b1531def27bde0ffeb942cc10f9917a47c6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 23 Jan 2016 22:13:54 +0100 Subject: bug #1150: make IncompleteCholesky more robust by iteratively increase the shift until the factorization succeed (with at most 10 attempts). --- .../IterativeLinearSolvers/IncompleteCholesky.h | 178 ++++++++++++--------- test/incomplete_cholesky.cpp | 30 +++- 2 files changed, 131 insertions(+), 77 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 18e9d1466..babc14d9e 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -240,7 +240,7 @@ void IncompleteCholesky::factorize(const _MatrixType else m_scale(j) = 1; - // FIXME disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster) + // TODO disable scaling if not needed, i.e., if it is roughly uniform? 
(this will make solve() faster) // Scale and compute the shift for the matrix RealScalar mindiag = NumTraits::highest(); @@ -251,96 +251,122 @@ void IncompleteCholesky::factorize(const _MatrixType eigen_internal_assert(rowIdx[colPtr[j]]==j && "IncompleteCholesky: only the lower triangular part must be stored"); mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag); } + + FactorType L_save = m_L; RealScalar shift = 0; if(mindiag <= RealScalar(0.)) shift = m_initialShift - mindiag; - // Apply the shift to the diagonal elements of the matrix - for (Index j = 0; j < n; j++) - vals[colPtr[j]] += shift; - - // jki version of the Cholesky factorization - for (Index j=0; j < n; ++j) - { - // Left-looking factorization of the j-th column - // First, load the j-th column into col_vals - Scalar diag = vals[colPtr[j]]; // It is assumed that only the lower part is stored - col_nnz = 0; - for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++) - { - StorageIndex l = rowIdx[i]; - col_vals(col_nnz) = vals[i]; - col_irow(col_nnz) = l; - col_pattern(l) = col_nnz; - col_nnz++; - } + m_info = NumericalIssue; + + // Try to perform the incomplete factorization using the current shift + int iter = 0; + do + { + // Apply the shift to the diagonal elements of the matrix + for (Index j = 0; j < n; j++) + vals[colPtr[j]] += shift; + + // jki version of the Cholesky factorization + Index j=0; + for (; j < n; ++j) { - typename std::list::iterator k; - // Browse all previous columns that will update column j - for(k = listCol[j].begin(); k != listCol[j].end(); k++) + // Left-looking factorization of the j-th column + // First, load the j-th column into col_vals + Scalar diag = vals[colPtr[j]]; // It is assumed that only the lower part is stored + col_nnz = 0; + for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++) { - Index jk = firstElt(*k); // First element to use in the column - eigen_internal_assert(rowIdx[jk]==j); - Scalar v_j_jk = numext::conj(vals[jk]); - - jk += 1; - for (Index i = jk; i < colPtr[*k+1]; i++) + StorageIndex l = rowIdx[i]; + col_vals(col_nnz) = vals[i]; + col_irow(col_nnz) = l; + col_pattern(l) = col_nnz; + col_nnz++; + } + { + typename std::list::iterator k; + // Browse all previous columns that will update column j + for(k = listCol[j].begin(); k != listCol[j].end(); k++) { - StorageIndex l = rowIdx[i]; - if(col_pattern[l]<0) + Index jk = firstElt(*k); // First element to use in the column + eigen_internal_assert(rowIdx[jk]==j); + Scalar v_j_jk = numext::conj(vals[jk]); + + jk += 1; + for (Index i = jk; i < colPtr[*k+1]; i++) { - col_vals(col_nnz) = vals[i] * v_j_jk; - col_irow[col_nnz] = l; - col_pattern(l) = col_nnz; - col_nnz++; + StorageIndex l = rowIdx[i]; + if(col_pattern[l]<0) + { + col_vals(col_nnz) = vals[i] * v_j_jk; + col_irow[col_nnz] = l; + col_pattern(l) = col_nnz; + col_nnz++; + } + else + col_vals(col_pattern[l]) -= vals[i] * v_j_jk; } - else - col_vals(col_pattern[l]) -= vals[i] * v_j_jk; + updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol); } - updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol); } + + // Scale the current column + if(numext::real(diag) <= 0) + { + if(++iter>=10) + return; + + // increase shift + shift = numext::maxi(m_initialShift,RealScalar(2)*shift); + // restore m_L, col_pattern, and listCol + vals = Map(L_save.valuePtr(), nnz); + rowIdx = Map(L_save.innerIndexPtr(), nnz); + colPtr = Map(L_save.outerIndexPtr(), n+1); + col_pattern.fill(-1); + for(Index i=0; i cvals = col_vals.head(col_nnz); + Ref cirow = col_irow.head(col_nnz); + 
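+ // QuickSplit performs a quickselect-style partition: on return, the p
+ // entries of cvals with the largest magnitudes come first (cirow is
+ // permuted accordingly), which implements the "keep only the p largest
+ // fill-in elements per column" rule of this limited-memory factorization.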
internal::QuickSplit(cvals,cirow, p); + // Insert the largest p elements in the matrix + Index cpt = 0; + for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++) + { + vals[i] = col_vals(cpt); + rowIdx[i] = col_irow(cpt); + // restore col_pattern: + col_pattern(col_irow(cpt)) = -1; + cpt++; + } + // Get the first smallest row index and put it after the diagonal element + Index jk = colPtr(j)+1; + updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol); } - - // Scale the current column - if(numext::real(diag) <= 0) - { - m_info = NumericalIssue; - return; - } - - RealScalar rdiag = sqrt(numext::real(diag)); - vals[colPtr[j]] = rdiag; - for (Index k = 0; k cvals = col_vals.head(col_nnz); - Ref cirow = col_irow.head(col_nnz); - internal::QuickSplit(cvals,cirow, p); - // Insert the largest p elements in the matrix - Index cpt = 0; - for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++) + + if(j==n) { - vals[i] = col_vals(cpt); - rowIdx[i] = col_irow(cpt); - // restore col_pattern: - col_pattern(col_irow(cpt)) = -1; - cpt++; + m_factorizationIsOk = true; + m_info = Success; } - // Get the first smallest row index and put it after the diagonal element - Index jk = colPtr(j)+1; - updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol); - } - m_factorizationIsOk = true; - m_info = Success; + } while(m_info!=Success); } template diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 7acad9872..59ffe9259 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2015 Gael Guennebaud +// Copyright (C) 2015-2016 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -34,4 +34,32 @@ void test_incomplete_cholesky() CALL_SUBTEST_1(( test_incomplete_cholesky_T() )); CALL_SUBTEST_2(( test_incomplete_cholesky_T, int>() )); CALL_SUBTEST_3(( test_incomplete_cholesky_T() )); + +#ifdef EIGEN_TEST_PART_1 + // regression for bug 1150 + for(int N = 1; N<20; ++N) + { + Eigen::MatrixXd b( N, N ); + b.setOnes(); + + Eigen::SparseMatrix m( N, N ); + m.reserve(Eigen::VectorXi::Constant(N,4)); + for( int i = 0; i < N; ++i ) + { + m.insert( i, i ) = 1; + m.coeffRef( i, i / 2 ) = 2; + m.coeffRef( i, i / 3 ) = 2; + m.coeffRef( i, i / 4 ) = 2; + } + + Eigen::SparseMatrix A; + A = m * m.transpose(); + + Eigen::ConjugateGradient, + Eigen::Lower | Eigen::Upper, + Eigen::IncompleteCholesky > solver( A ); + VERIFY(solver.preconditioner().info() == Eigen::Success); + VERIFY(solver.info() == Eigen::Success); + } +#endif } -- cgit v1.2.3 From 369d6d1ae31c3e1a0f03196ccb9c792c6913ed76 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 23 Jan 2016 22:16:03 +0100 Subject: Add link to reference paper. --- Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index babc14d9e..adf3686b4 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -185,6 +185,10 @@ class IncompleteCholesky : public SparseSolverBase colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); }; +// Based on the following paper: +// C-J. Lin and J. J. 
Moré, Incomplete Cholesky Factorizations with +// Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999 +// http://ftp.mcs.anl.gov/pub/tech_reports/reports/P682.pdf template template void IncompleteCholesky::factorize(const _MatrixType& mat) @@ -316,7 +320,7 @@ void IncompleteCholesky::factorize(const _MatrixType { if(++iter>=10) return; - + // increase shift shift = numext::maxi(m_initialShift,RealScalar(2)*shift); // restore m_L, col_pattern, and listCol -- cgit v1.2.3 From 1cf85bd875ecbcfa1240b4ec08122d40d79101fd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 23 Jan 2016 22:40:11 +0100 Subject: bug #977: add stableNormalize[d] methods: they are analogues to normalize[d] but with carefull handling of under/over-flow --- Eigen/src/Core/Dot.h | 46 +++++++++++++++++++++++++++++++++++++++++++++ Eigen/src/Core/MatrixBase.h | 2 ++ test/stable_norm.cpp | 14 ++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 221fc3224..82d58fc0b 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -142,6 +142,52 @@ inline void MatrixBase::normalize() derived() /= numext::sqrt(z); } +/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow. + * + * \only_for_vectors + * + * This method is analogue to the normalized() method, but it reduces the risk of + * underflow and overflow when computing the norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), + * then this function returns a copy of the input. + * + * \sa stableNorm(), stableNormalize(), normalized() + */ +template +inline const typename MatrixBase::PlainObject +MatrixBase::stableNormalized() const +{ + typedef typename internal::nested_eval::type _Nested; + _Nested n(derived()); + RealScalar w = n.cwiseAbs().maxCoeff(); + RealScalar z = (n/w).squaredNorm(); + if(z>RealScalar(0)) + return n / (numext::sqrt(z)*w); + else + return n; +} + +/** Normalizes the vector while avoid underflow and overflow + * + * \only_for_vectors + * + * This method is analogue to the normalize() method, but it reduces the risk of + * underflow and overflow when computing the norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged. 
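+ *
+ * A small usage sketch:
+ * \code
+ * Vector3d v(1e-300, 2e-300, -2e-300); // squaredNorm() would underflow to 0
+ * v.stableNormalize();                 // v is now (1/3, 2/3, -2/3)
+ * \endcode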
+ * + * \sa stableNorm(), stableNormalized(), normalize() + */ +template +inline void MatrixBase::stableNormalize() +{ + RealScalar w = cwiseAbs().maxCoeff(); + RealScalar z = (derived()/w).squaredNorm(); + if(z>RealScalar(0)) + derived() /= numext::sqrt(z)*w; +} + //---------- implementation of other norms ---------- namespace internal { diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index f3935802d..338879c73 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -204,7 +204,9 @@ template class MatrixBase RealScalar blueNorm() const; RealScalar hypotNorm() const; EIGEN_DEVICE_FUNC const PlainObject normalized() const; + EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const; EIGEN_DEVICE_FUNC void normalize(); + EIGEN_DEVICE_FUNC void stableNormalize(); EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const; EIGEN_DEVICE_FUNC void adjointInPlace(); diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp index 7561ae8be..9f12320e0 100644 --- a/test/stable_norm.cpp +++ b/test/stable_norm.cpp @@ -163,6 +163,20 @@ template void stable_norm(const MatrixType& m) VERIFY(!(numext::isfinite)(v.blueNorm())); VERIFY((numext::isnan)(v.blueNorm())); VERIFY(!(numext::isfinite)(v.hypotNorm())); VERIFY((numext::isnan)(v.hypotNorm())); } + + // stableNormalize[d] + { + VERIFY_IS_APPROX(vrand.stableNormalized(), vrand.normalized()); + MatrixType vcopy(vrand); + vcopy.stableNormalize(); + VERIFY_IS_APPROX(vcopy, vrand.normalized()); + VERIFY_IS_APPROX((vrand.stableNormalized()).norm(), RealScalar(1)); + VERIFY_IS_APPROX(vcopy.norm(), RealScalar(1)); + VERIFY_IS_APPROX((vbig.stableNormalized()).norm(), RealScalar(1)); + VERIFY_IS_APPROX((vsmall.stableNormalized()).norm(), RealScalar(1)); + VERIFY_IS_APPROX(vbig, vbig.stableNorm() * vbig.stableNormalized()); + VERIFY_IS_APPROX(vsmall, vsmall.stableNorm() * vsmall.stableNormalized()); + } } void test_stable_norm() -- cgit v1.2.3 From 19e437daf09e7c449bb644a072e6caad261f5d36 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Sun, 24 Jan 2016 15:50:36 +0100 Subject: Copyedit documentation: typos, spelling --- doc/TutorialReductionsVisitorsBroadcasting.dox | 14 ++++++-------- doc/UsingIntelMKL.dox | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox index 6d25ff0ea..f5322b4a6 100644 --- a/doc/TutorialReductionsVisitorsBroadcasting.dox +++ b/doc/TutorialReductionsVisitorsBroadcasting.dox @@ -101,17 +101,16 @@ row and column position are to be stored. These variables should be of type \verbinclude Tutorial_ReductionsVisitorsBroadcasting_visitors.out -Note that both functions also return the value of the minimum or maximum coefficient if needed, -as if it was a typical reduction operation. +Both functions also return the value of the minimum or maximum coefficient. \section TutorialReductionsVisitorsBroadcastingPartialReductions Partial reductions Partial reductions are reductions that can operate column- or row-wise on a Matrix or Array, applying the reduction operation on each column or row and -returning a column or row-vector with the corresponding values. Partial reductions are applied +returning a column or row vector with the corresponding values. Partial reductions are applied with \link DenseBase::colwise() colwise() \endlink or \link DenseBase::rowwise() rowwise() \endlink. 
A simple example is obtaining the maximum of the elements -in each column in a given matrix, storing the result in a row-vector: +in each column in a given matrix, storing the result in a row vector: @@ -133,8 +132,7 @@ The same operation can be performed row-wise: \verbinclude Tutorial_ReductionsVisitorsBroadcasting_rowwise.out
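A minimal sketch of such a reduction:
\code
Eigen::MatrixXf mat(2,4);
mat << 1, 2, 6, 9,
       3, 1, 7, 2;
std::cout << mat.colwise().maxCoeff(); // prints the 1x4 row vector: 3 2 7 9
\endcode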
<tr><th>Example:</th><th>Output:</th></tr>
-Note that column-wise operations return a 'row-vector' while row-wise operations -return a 'column-vector' +Note that column-wise operations return a row vector, while row-wise operations return a column vector. \subsection TutorialReductionsVisitorsBroadcastingPartialReductionsCombined Combining partial reductions with other operations It is also possible to use the result of a partial reduction to do further processing. @@ -176,7 +174,7 @@ The concept behind broadcasting is similar to partial reductions, with the diffe constructs an expression where a vector (column or row) is interpreted as a matrix by replicating it in one direction. -A simple example is to add a certain column-vector to each column in a matrix. +A simple example is to add a certain column vector to each column in a matrix. This can be accomplished with: @@ -253,7 +251,7 @@ is a new matrix whose size is the same as matrix m: \f[ \f] - (m.colwise() - v).colwise().squaredNorm() is a partial reduction, computing the squared norm column-wise. The result of -this operation is a row-vector where each coefficient is the squared Euclidean distance between each column in m and v: \f[ +this operation is a row vector where each coefficient is the squared Euclidean distance between each column in m and v: \f[ \mbox{(m.colwise() - v).colwise().squaredNorm()} = \begin{bmatrix} 1 & 505 & 32 & 50 diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox index 84db992b6..02c62ad85 100644 --- a/doc/UsingIntelMKL.dox +++ b/doc/UsingIntelMKL.dox @@ -52,7 +52,7 @@ When doing so, a number of Eigen's algorithms are silently substituted with call These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex, and \c complex. Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms. -In addition you can coarsely select choose which parts will be substituted by defining one or multiple of the following macros: +In addition you can choose which parts will be substituted by defining one or multiple of the following macros:
-- cgit v1.2.3 From cc482e32f1b6026d965aaa6b5bcab63a96701049 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Sun, 24 Jan 2016 15:50:59 +0100 Subject: Method is called visit, not visitor --- Eigen/src/Core/Visitor.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index 7aac0b6e1..d71dfc968 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -197,7 +197,7 @@ struct functor_traits > { /** \returns the minimum of all coefficients of *this and puts in *row and *col its location. * \warning the result is undefined if \c *this contains NaN. * - * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff() + * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff() */ template template @@ -215,7 +215,7 @@ DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const /** \returns the minimum of all coefficients of *this and puts in *index its location. * \warning the result is undefined if \c *this contains NaN. * - * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff() + * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff() */ template template @@ -233,7 +233,7 @@ DenseBase::minCoeff(IndexType* index) const /** \returns the maximum of all coefficients of *this and puts in *row and *col its location. * \warning the result is undefined if \c *this contains NaN. * - * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff() + * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff() */ template template -- cgit v1.2.3 From bd207ce11e8133874d5a12573921ea93874a0f9e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 24 Jan 2016 20:36:05 -0800 Subject: Added missing EIGEN_DEVICE_FUNC qualifier --- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index e7daf7304..bd83d5de8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -136,7 +136,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_buffer + index); } -- cgit v1.2.3 From e3a15a03a4fe758ed0a00f3a2b083d7ca58ca16b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 24 Jan 2016 23:04:50 -0800 Subject: Don't explicitely evaluate the subexpression from TensorForcedEval::evalSubExprIfNeeded, as it will be done when executing the EvalTo subexpression --- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c9b0b2f28..58b864787 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -106,7 +106,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(CoeffReturnType*) { - m_impl.evalSubExprsIfNeeded(NULL); const Index numValues = m_impl.dimensions().TotalSize(); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. @@ -119,7 +118,6 @@ struct TensorEvaluator, Device> EvalTo evalToTmp(m_buffer, m_op); const bool PacketAccess = internal::IsVectorizable::value; internal::TensorExecutor::run(evalToTmp, m_device); - m_impl.cleanup(); return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { -- cgit v1.2.3 From 869b4443ac4a55c09a0632e2dbf621587749e164 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 11:55:39 +0100 Subject: Add SparseVector::conservativeResize() method. --- Eigen/src/SparseCore/SparseVector.h | 18 +++++++++++++ test/sparse_vector.cpp | 54 +++++++++++++++++++++++++++++++------ 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 7ec73a365..4db3003d2 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -222,6 +222,24 @@ class SparseVector m_data.clear(); } + /** Resizes the sparse vector to \a newSize, while leaving old values untouched. + * + * If the size of the vector is decreased, then the storage of the out-of bounds coefficients is kept and reserved. + * Call .data().squeeze() to free extra memory. + * + * \sa reserve(), setZero() + */ + void conservativeResize(Index newSize) + { + if (newSize < m_size) + { + Index i = 0; + while (i void sparse_vector(int rows, int cols) +template void sparse_vector(int rows, int cols) { double densityMat = (std::max)(8./(rows*cols), 0.01); double densityVec = (std::max)(8./float(rows), 0.1); typedef Matrix DenseMatrix; typedef Matrix DenseVector; - typedef SparseVector SparseVectorType; - typedef SparseMatrix SparseMatrixType; + typedef SparseVector SparseVectorType; + typedef SparseMatrix SparseMatrixType; Scalar eps = 1e-6; SparseMatrixType m1(rows,rows); @@ -87,8 +87,10 @@ template void sparse_vector(int rows, int cols) VERIFY_IS_APPROX(m1*v2, refM1*refV2); VERIFY_IS_APPROX(v1.dot(m1*v2), refV1.dot(refM1*refV2)); - int i = internal::random(0,rows-1); - VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i))); + { + int i = internal::random(0,rows-1); + VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i))); + } VERIFY_IS_APPROX(v1.squaredNorm(), refV1.squaredNorm()); @@ -111,15 +113,51 @@ template void sparse_vector(int rows, int cols) VERIFY_IS_APPROX(refV3 = v1.transpose(),v1.toDense()); VERIFY_IS_APPROX(DenseVector(v1),v1.toDense()); + // test conservative resize + { + std::vector inc; + if(rows > 3) + inc.push_back(-3); + inc.push_back(0); + inc.push_back(3); + inc.push_back(1); + inc.push_back(10); + + for(std::size_t i = 0; i< inc.size(); i++) { + StorageIndex incRows = inc[i]; + SparseVectorType vec1(rows); + DenseVector refVec1 = DenseVector::Zero(rows); + initSparse(densityVec, refVec1, vec1); + + vec1.conservativeResize(rows+incRows); + refVec1.conservativeResize(rows+incRows); + if (incRows > 0) refVec1.tail(incRows).setZero(); + + VERIFY_IS_APPROX(vec1, refVec1); + + // Insert new values + if (incRows > 0) + vec1.insert(vec1.rows()-1) = refVec1(refVec1.rows()-1) = 1; + + VERIFY_IS_APPROX(vec1, refVec1); + } + } + } void test_sparse_vector() { for(int i = 0; i < g_repeat; i++) { + int r = Eigen::internal::random(1,500), c = Eigen::internal::random(1,500); + 
if(Eigen::internal::random(0,4) == 0) { + r = c; // check square matrices in 25% of tries + } + EIGEN_UNUSED_VARIABLE(r+c); + CALL_SUBTEST_1(( sparse_vector(8, 8) )); - CALL_SUBTEST_2(( sparse_vector, int>(16, 16) )); - CALL_SUBTEST_1(( sparse_vector(299, 535) )); - CALL_SUBTEST_1(( sparse_vector(299, 535) )); + CALL_SUBTEST_2(( sparse_vector, int>(r, c) )); + CALL_SUBTEST_1(( sparse_vector(r, c) )); + CALL_SUBTEST_1(( sparse_vector(r, c) )); } } -- cgit v1.2.3 From b114e6fd3b61c7ef93f6b94e194d316f0ab19036 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 11:56:25 +0100 Subject: Improve documentation. --- Eigen/src/SparseCore/SparseMatrix.h | 7 ++++++- Eigen/src/SparseCore/SparseVector.h | 13 +++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 91bada40f..8b57445e6 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -538,7 +538,12 @@ class SparseMatrix } /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched. - * \sa reserve(), setZero() + * + * If the sizes of the matrix are decreased, then the matrix is turned to \b uncompressed-mode + * and the storage of the out of bounds coefficients is kept and reserved. + * Call makeCompressed() to pack the entries and squeeze extra memory. + * + * \sa reserve(), setZero(), makeCompressed() */ void conservativeResize(Index rows, Index cols) { diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 4db3003d2..40a754300 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -205,17 +205,30 @@ class SparseVector inline void finalize() {} + /** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */ void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { m_data.prune(reference,epsilon); } + /** Resizes the sparse vector to \a rows x \a cols + * + * This method is provided for compatibility with matrices. + * For a column vector, \a cols must be equal to 1. + * For a row vector, \a rows must be equal to 1. + * + * \sa resize(Index) + */ void resize(Index rows, Index cols) { eigen_assert((IsColVector ? cols : rows)==1 && "Outer dimension must equal 1"); resize(IsColVector ? rows : cols); } + /** Resizes the sparse vector to \a newSize + * This method deletes all entries, thus leaving an empty sparse vector + * + * \sa conservativeResize(), setZero() */ void resize(Index newSize) { m_size = newSize; -- cgit v1.2.3 From c10021c00a6cb6033bc479a46aef058c48836efd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 15:50:55 +0100 Subject: bug #1144: clarify the doc about aliasing in case of resizing and matrix product. --- doc/TopicAliasing.dox | 30 ++++++++++++++++++++++++++---- doc/snippets/TopicAliasing_mult4.cpp | 5 +++++ doc/snippets/TopicAliasing_mult5.cpp | 5 +++++ 3 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 doc/snippets/TopicAliasing_mult4.cpp create mode 100644 doc/snippets/TopicAliasing_mult5.cpp diff --git a/doc/TopicAliasing.dox b/doc/TopicAliasing.dox index c2654aed2..a8f164428 100644 --- a/doc/TopicAliasing.dox +++ b/doc/TopicAliasing.dox @@ -153,10 +153,11 @@ not necessary to evaluate the right-hand side explicitly. \section TopicAliasingMatrixMult Aliasing and matrix multiplication -Matrix multiplication is the only operation in %Eigen that assumes aliasing by default. 
Thus, if \c matA is a -matrix, then the statement matA = matA * matA; is safe. All other operations in %Eigen assume that -there are no aliasing problems, either because the result is assigned to a different matrix or because it is a -component-wise operation. +Matrix multiplication is the only operation in %Eigen that assumes aliasing by default, under the +condition that the destination matrix is not resized. +Thus, if \c matA is a \b squared matrix, then the statement matA = matA * matA; is safe. +All other operations in %Eigen assume that there are no aliasing problems, +either because the result is assigned to a different matrix or because it is a component-wise operation.
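+For instance (a minimal sketch of the safe case):
+\code
+MatrixXf matA(2,2);
+matA << 2, 0, 0, 2;
+matA = matA * matA; // safe: the destination is not resized by this assignment
+\endcode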
\c EIGEN_USE_BLAS Enables the use of external BLAS level 2 and 3 routines (currently works with Intel MKL only)
@@ -198,6 +199,27 @@ may get wrong results: \verbinclude TopicAliasing_mult3.out
+<tr><th>Example</th><th>Output</th></tr>
+Moreover, starting in Eigen 3.3, aliasing is \b not assumed if the destination matrix is resized and the product is not directly assigned to the destination. +Therefore, the following example is also wrong: + + + + +
+<tr><th>Example</th><th>Output</th></tr>
+\include TopicAliasing_mult4.cpp + +\verbinclude TopicAliasing_mult4.out +
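+Here the assignment first resizes \c A from 2x2 to 3x2 to match the product
+dimensions, and the product is then evaluated directly into \c A while still
+reading the already-overwritten coefficients of \c A.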
+ +As for any aliasing issue, you can resolve it by explicitly evaluating the expression prior to assignment: + + + +
+<tr><th>Example</th><th>Output</th></tr>
+\include TopicAliasing_mult5.cpp + +\verbinclude TopicAliasing_mult5.out +
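+With the explicit call to \c eval() the product is computed into a temporary
+first, so \c A correctly receives the coefficient-wise absolute value of
+\f[ B \, A = \begin{bmatrix} 4 & 0 \\ 0 & -6 \\ 2 & -2 \end{bmatrix}. \f]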
\section TopicAliasingSummary Summary diff --git a/doc/snippets/TopicAliasing_mult4.cpp b/doc/snippets/TopicAliasing_mult4.cpp new file mode 100644 index 000000000..8a8992f6c --- /dev/null +++ b/doc/snippets/TopicAliasing_mult4.cpp @@ -0,0 +1,5 @@ +MatrixXf A(2,2), B(3,2); +B << 2, 0, 0, 3, 1, 1; +A << 2, 0, 0, -2; +A = (B * A).cwiseAbs(); +cout << A; \ No newline at end of file diff --git a/doc/snippets/TopicAliasing_mult5.cpp b/doc/snippets/TopicAliasing_mult5.cpp new file mode 100644 index 000000000..1a36defde --- /dev/null +++ b/doc/snippets/TopicAliasing_mult5.cpp @@ -0,0 +1,5 @@ +MatrixXf A(2,2), B(3,2); +B << 2, 0, 0, 3, 1, 1; +A << 2, 0, 0, -2; +A = (B * A).eval().cwiseAbs(); +cout << A; -- cgit v1.2.3 From e58827d2ed32cee5362e4d7d007da06a2bdc7309 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 17:16:33 +0100 Subject: bug #51: make general_matrix_matrix_triangular_product use L3-blocking helper so that general symmetric rank-updates and general-matrix-to-triangular products do not trigger dynamic memory allocation for fixed size matrices. --- .../Core/products/GeneralMatrixMatrixTriangular.h | 44 +++++++++++++++++----- Eigen/src/Core/products/SelfadjointProduct.h | 24 +++++++++--- test/nomalloc.cpp | 9 +++-- 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index a36eb2fe0..f80f3b410 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -42,13 +42,14 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, + const ResScalar& alpha, level3_blocking& blocking) { general_matrix_matrix_triangular_product - ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking); } }; @@ -58,7 +59,8 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, + const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; @@ -73,12 +75,20 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc, 1); + + kc = blocking.kc(); + mc = (std::min)(size,blocking.mc()); + nc = (std::min)(size,blocking.nc()); + // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*size; + + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; @@ -256,13 +266,27 @@ struct general_product_to_triangular_selector typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + enum { + IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, + LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualLhs.cols(); + + typedef internal::gemm_blocking_space BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + internal::general_matrix_matrix_triangular_product - ::run(mat.cols(), actualLhs.cols(), + typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + IsRowMajor ? RowMajor : ColMajor, UpLo> + ::run(size, depth, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), - mat.data(), mat.outerStride(), actualAlpha); + mat.data(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 2af00058d..f038d686f 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -92,15 +92,27 @@ struct selfadjoint_product_selector Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); - enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0 }; + enum { + IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, + OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualOther.cols(); + + typedef internal::gemm_blocking_space BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + internal::general_matrix_matrix_triangular_product::IsComplex, - Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, - MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> - ::run(mat.cols(), actualOther.cols(), + Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits::IsComplex, + Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, + IsRowMajor ? 
RowMajor : ColMajor, UpLo> + ::run(size, depth, &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), - mat.data(), mat.outerStride(), actualAlpha); + mat.data(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index 060276a20..d85e9e5bc 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -81,11 +81,12 @@ template void nomalloc(const MatrixType& m) m2.template selfadjointView().rankUpdate(m1.row(0),-1); // The following fancy matrix-matrix products are not safe yet regarding static allocation -// m1 += m1.template triangularView() * m2.col(; -// m1.template selfadjointView().rankUpdate(m2); -// m1 += m1.template triangularView() * m2; +// m1.col(1) += m1.template triangularView() * m2.col(0); + m2.template selfadjointView().rankUpdate(m1); + m2 += m2.template triangularView() * m1; + m2.template triangularView() = m2 * m2; // m1 += m1.template selfadjointView() * m2; -// VERIFY_IS_APPROX(m1,m1); + VERIFY_IS_APPROX(m2,m2); } template -- cgit v1.2.3 From 2f9e6314b183d333ad75ec578815073ed9fb390e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 21:56:05 +0100 Subject: update BLAS interface to general_matrix_matrix_triangular_product --- blas/level3_impl.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 6a6b00728..835e53680 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -315,7 +315,7 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp { // std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; #if !ISCOMPLEX - typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&); + typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); static functype func[8]; static bool init = false; @@ -381,8 +381,10 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp matrix(c, *n, *n, *ldc).triangularView() += alpha * matrix(a,*k,*n,*lda).transpose() * matrix(a,*k,*n,*lda); } #else + internal::gemm_blocking_space blocking(*n,*n,*k,1,false); + int code = OP(*op) | (UPLO(*uplo) << 2); - func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha); + func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking); #endif return 0; @@ -516,7 +518,7 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp { // std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; - typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&); + typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); static functype func[8]; static bool init = false; @@ -571,7 +573,8 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp if(*k>0 && alpha!=RealScalar(0)) { - func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha); + internal::gemm_blocking_space blocking(*n,*n,*k,1,false); + func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking); matrix(c, 
*n, *n, *ldc).diagonal().imag().setZero(); } return 0; -- cgit v1.2.3 From 8328caa618731fb2a5802daaf8088db4175567a2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 22:06:42 +0100 Subject: bug #51: add block preallocation mechanism to selfadjoit*matrix product. --- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 46 ++++++++++++----------- blas/level3_impl.h | 23 +++++++----- test/nomalloc.cpp | 2 +- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index f84f54982..ba8ee1d53 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -291,7 +291,7 @@ struct product_selfadjoint_matrix& blocking) { product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), ColMajor> - ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; @@ -314,7 +314,7 @@ struct product_selfadjoint_matrix& blocking); }; template & blocking) { Index size = rows; @@ -340,17 +340,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc, 1); - // kc must smaller than mc + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // kc must be smaller than mc kc = (std::min)(kc,mc); + std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB; + + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel gebp_kernel; symm_pack_lhs pack_lhs; @@ -410,7 +409,7 @@ struct product_selfadjoint_matrix& blocking); }; template & blocking) { Index size = cols; @@ -432,14 +431,12 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc, 1); + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; @@ -498,6 +495,11 @@ struct selfadjoint_product_impl Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs); + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, + Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType; + + BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false); + internal::product_selfadjoint_matrix::Flags &RowMajorBit) ? 
RowMajor : ColMajor, LhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), @@ -509,7 +511,7 @@ struct selfadjoint_product_impl &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + actualAlpha, blocking // alpha ); } }; diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 835e53680..b2772b190 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -275,9 +275,9 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa return 1; } + int size = (SIDE(*side)==LEFT) ? (*m) : (*n); #if ISCOMPLEX // FIXME add support for symmetric complex matrix - int size = (SIDE(*side)==LEFT) ? (*m) : (*n); Matrix matA(size,size); if(UPLO(*uplo)==UP) { @@ -294,13 +294,15 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa else if(SIDE(*side)==RIGHT) matrix(c, *m, *n, *ldc) += alpha * matrix(b, *m, *n, *ldb) * matA; #else + internal::gemm_blocking_space blocking(*m,*n,size,1,false); + if(SIDE(*side)==LEFT) - if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); + else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else return 0; else if(SIDE(*side)==RIGHT) - if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); + else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); else return 0; else return 0; @@ -488,20 +490,23 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa return 1; } + int size = (SIDE(*side)==LEFT) ? 
(*m) : (*n); + internal::gemm_blocking_space blocking(*m,*n,size,1,false); + if(SIDE(*side)==LEFT) { if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix - ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix - ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else return 0; } else if(SIDE(*side)==RIGHT) { if(UPLO(*uplo)==UP) matrix(c,*m,*n,*ldc) += alpha * matrix(b,*m,*n,*ldb) * matrix(a,*n,*n,*lda).selfadjointView();/*internal::product_selfadjoint_matrix - ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);*/ + ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);*/ else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix - ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); else return 0; } else diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index d85e9e5bc..077ecae55 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -85,7 +85,7 @@ template void nomalloc(const MatrixType& m) m2.template selfadjointView().rankUpdate(m1); m2 += m2.template triangularView() * m1; m2.template triangularView() = m2 * m2; -// m1 += m1.template selfadjointView() * m2; + m1 += m1.template selfadjointView() * m2; VERIFY_IS_APPROX(m2,m2); } -- cgit v1.2.3 From 5eb2790be093cc23d4c4808b5c53d79eba22ecb0 Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Mon, 25 Jan 2016 22:17:52 +0100 Subject: Fixed minor typo in SplineFitting. --- unsupported/Eigen/src/Splines/SplineFitting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h index d3c245fa9..8e6a5aaed 100644 --- a/unsupported/Eigen/src/Splines/SplineFitting.h +++ b/unsupported/Eigen/src/Splines/SplineFitting.h @@ -167,7 +167,7 @@ namespace Eigen derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(), temporaryKnots.data()); - // Number of control points (one for each point and derivative) plus spline order. + // Number of knots (one for each point and derivative) plus spline order. DenseIndex numKnots = numParameters + numDerivatives + degree + 1; knots.resize(numKnots); -- cgit v1.2.3 From 44d4674955213013e7db7871c99d7968b8555075 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 26 Jan 2016 16:45:33 +0100 Subject: bug #1153: Don't rely on __GXX_EXPERIMENTAL_CXX0X__ to detect C++11 support --- Eigen/src/Core/util/Macros.h | 1 - Eigen/src/Core/util/StaticAssert.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 9b4f8faa7..cf6b03ec7 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -336,7 +336,6 @@ // Do we support r-value references? 
#if (__has_feature(cxx_rvalue_references) || \ (defined(__cplusplus) && __cplusplus >= 201103L) || \ - defined(__GXX_EXPERIMENTAL_CXX0X__) || \ (EIGEN_COMP_MSVC >= 1600)) #define EIGEN_HAVE_RVALUE_REFERENCES #endif diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index e174509e0..afae2e51e 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -26,7 +26,7 @@ #ifndef EIGEN_NO_STATIC_ASSERT - #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (EIGEN_COMP_MSVC >= 1600) + #if __has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600) // if native static_assert is enabled, let's use it #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); -- cgit v1.2.3 From 639b1d864a3d8e3cd5f2f060af9e1a5fdae27008 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 26 Jan 2016 11:44:16 -0500 Subject: bug #1152: Fix data race in static initialization of blas --- blas/level2_cplx_impl.h | 95 +++++---------- blas/level2_impl.h | 303 ++++++++++++++++++++++++----------------------- blas/level2_real_impl.h | 123 +++++-------------- blas/level3_impl.h | 305 +++++++++++++++++++++++++++--------------------- 4 files changed, 388 insertions(+), 438 deletions(-) diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h index 9b845de22..2edc51596 100644 --- a/blas/level2_cplx_impl.h +++ b/blas/level2_cplx_impl.h @@ -19,19 +19,12 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy) { typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::selfadjoint_matrix_vector_product::run); - func[LO] = (internal::selfadjoint_matrix_vector_product::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::selfadjoint_matrix_vector_product::run), + // array index: LO + (internal::selfadjoint_matrix_vector_product::run), + }; Scalar* a = reinterpret_cast(pa); Scalar* x = reinterpret_cast(px); @@ -111,19 +104,12 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pap) { typedef void (*functype)(int, Scalar*, const Scalar*, RealScalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::selfadjoint_packed_rank1_update::run); - func[LO] = (internal::selfadjoint_packed_rank1_update::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::selfadjoint_packed_rank1_update::run), + // array index: LO + (internal::selfadjoint_packed_rank1_update::run), + }; Scalar* x = reinterpret_cast(px); Scalar* ap = reinterpret_cast(pap); @@ -162,19 +148,12 @@ int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap) { typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::packed_rank2_update_selector::run); - func[LO] = 
(internal::packed_rank2_update_selector::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::packed_rank2_update_selector::run), + // array index: LO + (internal::packed_rank2_update_selector::run), + }; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); @@ -217,19 +196,12 @@ int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pa, int *lda) { typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (selfadjoint_rank1_update::run); - func[LO] = (selfadjoint_rank1_update::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (selfadjoint_rank1_update::run), + // array index: LO + (selfadjoint_rank1_update::run), + }; Scalar* x = reinterpret_cast(px); Scalar* a = reinterpret_cast(pa); @@ -271,19 +243,12 @@ int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int EIGEN_BLAS_FUNC(her2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda) { typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::rank2_update_selector::run); - func[LO] = (internal::rank2_update_selector::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::rank2_update_selector::run), + // array index: LO + (internal::rank2_update_selector::run), + }; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); diff --git a/blas/level2_impl.h b/blas/level2_impl.h index 917f2e372..d09db0cc6 100644 --- a/blas/level2_impl.h +++ b/blas/level2_impl.h @@ -26,20 +26,15 @@ struct general_matrix_vector_product_wrapper int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc) { typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar); - static functype func[4]; - - static bool init = false; - if(!init) - { - for(int k=0; k<4; ++k) - func[k] = 0; - - func[NOTR] = (general_matrix_vector_product_wrapper::run); - func[TR ] = (general_matrix_vector_product_wrapper::run); - func[ADJ ] = (general_matrix_vector_product_wrapper::run); - - init = true; - } + static const functype func[4] = { + // array index: NOTR + (general_matrix_vector_product_wrapper::run), + // array index: TR + (general_matrix_vector_product_wrapper::run), + // array index: ADJ + (general_matrix_vector_product_wrapper::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -90,32 +85,36 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb) { typedef void (*functype)(int, const Scalar *, int, Scalar *); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int k=0; k<16; ++k) - func[k] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - func[TR | (UP << 2) | (NUNIT << 3)] = 
(internal::triangular_solve_vector::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - func[TR | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -145,32 +144,36 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb) { typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, const Scalar&); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int k=0; k<16; ++k) - func[k] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[TR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[TR | (LO << 2) | (UNIT << 3)] = 
(internal::triangular_matrix_vector_product::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -346,32 +349,36 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx) { typedef void (*functype)(int, int, const Scalar *, int, Scalar *); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int i=0; i<16; ++i) - func[i] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[TR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[TR | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + 
(internal::band_solve_triangular_selector::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + 0, + }; Scalar* a = reinterpret_cast(pa); Scalar* x = reinterpret_cast(px); @@ -416,32 +423,36 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx) { typedef void (*functype)(int, const Scalar*, const Scalar*, Scalar*, Scalar); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int k=0; k<16; ++k) - func[k] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[TR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[TR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + 
(internal::packed_triangular_matrix_vector_product::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + 0 + }; Scalar* ap = reinterpret_cast(pap); Scalar* x = reinterpret_cast(px); @@ -487,32 +498,36 @@ int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx) { typedef void (*functype)(int, const Scalar*, Scalar*); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int k=0; k<16; ++k) - func[k] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[TR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[TR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + 0 + }; Scalar* ap = reinterpret_cast(pap); Scalar* x = reinterpret_cast(px); diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h index cac89b268..4896a03d9 100644 --- a/blas/level2_real_impl.h +++ b/blas/level2_real_impl.h @@ -13,19 +13,12 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy) { typedef void (*functype)(int, const 
Scalar*, int, const Scalar*, Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::selfadjoint_matrix_vector_product::run); - func[LO] = (internal::selfadjoint_matrix_vector_product::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::selfadjoint_matrix_vector_product::run), + // array index: LO + (internal::selfadjoint_matrix_vector_product::run), + }; Scalar* a = reinterpret_cast(pa); Scalar* x = reinterpret_cast(px); @@ -71,34 +64,13 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pc, int *ldc) { -// typedef void (*functype)(int, const Scalar *, int, Scalar *, int, Scalar); -// static functype func[2]; - -// static bool init = false; -// if(!init) -// { -// for(int k=0; k<2; ++k) -// func[k] = 0; -// -// func[UP] = (internal::selfadjoint_product::run); -// func[LO] = (internal::selfadjoint_product::run); - -// init = true; -// } typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (selfadjoint_rank1_update::run); - func[LO] = (selfadjoint_rank1_update::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (selfadjoint_rank1_update::run), + // array index: LO + (selfadjoint_rank1_update::run), + }; Scalar* x = reinterpret_cast(px); Scalar* c = reinterpret_cast(pc); @@ -131,34 +103,13 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, // C := alpha*x*y' + alpha*y*x' + C int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, int *ldc) { -// typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar); -// static functype func[2]; -// -// static bool init = false; -// if(!init) -// { -// for(int k=0; k<2; ++k) -// func[k] = 0; -// -// func[UP] = (internal::selfadjoint_product::run); -// func[LO] = (internal::selfadjoint_product::run); -// -// init = true; -// } typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::rank2_update_selector::run); - func[LO] = (internal::rank2_update_selector::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::rank2_update_selector::run), + // array index: LO + (internal::rank2_update_selector::run), + }; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); @@ -234,19 +185,12 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *pap) { typedef void (*functype)(int, Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::selfadjoint_packed_rank1_update::run); - func[LO] = (internal::selfadjoint_packed_rank1_update::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::selfadjoint_packed_rank1_update::run), + // array index: LO + 
(internal::selfadjoint_packed_rank1_update::run), + }; Scalar* x = reinterpret_cast(px); Scalar* ap = reinterpret_cast(pap); @@ -285,19 +229,12 @@ int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *in int EIGEN_BLAS_FUNC(spr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap) { typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::packed_rank2_update_selector::run); - func[LO] = (internal::packed_rank2_update_selector::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::packed_rank2_update_selector::run), + // array index: LO + (internal::packed_rank2_update_selector::run), + }; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index b2772b190..267a727ef 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -13,24 +13,29 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal { // std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n"; typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar, internal::level3_blocking&, Eigen::internal::GemmParallelInfo*); - static functype func[12]; - - static bool init = false; - if(!init) - { - for(int i=0; i<12; ++i) - func[i] = 0; - func[NOTR | (NOTR << 2)] = (internal::general_matrix_matrix_product::run); - func[TR | (NOTR << 2)] = (internal::general_matrix_matrix_product::run); - func[ADJ | (NOTR << 2)] = (internal::general_matrix_matrix_product::run); - func[NOTR | (TR << 2)] = (internal::general_matrix_matrix_product::run); - func[TR | (TR << 2)] = (internal::general_matrix_matrix_product::run); - func[ADJ | (TR << 2)] = (internal::general_matrix_matrix_product::run); - func[NOTR | (ADJ << 2)] = (internal::general_matrix_matrix_product::run); - func[TR | (ADJ << 2)] = (internal::general_matrix_matrix_product::run); - func[ADJ | (ADJ << 2)] = (internal::general_matrix_matrix_product::run); - init = true; - } + static const functype func[12] = { + // array index: NOTR | (NOTR << 2) + (internal::general_matrix_matrix_product::run), + // array index: TR | (NOTR << 2) + (internal::general_matrix_matrix_product::run), + // array index: ADJ | (NOTR << 2) + (internal::general_matrix_matrix_product::run), + 0, + // array index: NOTR | (TR << 2) + (internal::general_matrix_matrix_product::run), + // array index: TR | (TR << 2) + (internal::general_matrix_matrix_product::run), + // array index: ADJ | (TR << 2) + (internal::general_matrix_matrix_product::run), + 0, + // array index: NOTR | (ADJ << 2) + (internal::general_matrix_matrix_product::run), + // array index: TR | (ADJ << 2) + (internal::general_matrix_matrix_product::run), + // array index: ADJ | (ADJ << 2) + (internal::general_matrix_matrix_product::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -73,49 +78,64 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, { // std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n"; typedef 
void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, internal::level3_blocking&); - static functype func[32]; - - static bool init = false; - if(!init) - { - for(int i=0; i<32; ++i) - func[i] = 0; - - func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - - - func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - - init = true; - } + static const functype func[32] = { + // array index: NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run),\ + 0, + // array index: NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (LEFT << 2) | (LO << 
3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -162,47 +182,64 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, { // std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n"; typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); - static functype func[32]; - static bool init = false; - if(!init) - { - for(int k=0; k<32; ++k) - func[k] = 0; - - func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | 
(NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - init = true; - } + static const functype func[32] = { + // array index: NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (LEFT << 
2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -318,24 +355,22 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp // std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; #if !ISCOMPLEX typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); - static functype func[8]; - - static bool init = false; - if(!init) - { - for(int i=0; i<8; ++i) - func[i] = 0; - - func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[TR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - - func[NOTR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[TR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[ADJ | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - - init = true; - } + static const functype func[8] = { + // array index: NOTR | (UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + // array index: TR | (UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + // array index: ADJ | (UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0, + // array index: NOTR | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + // array index: TR | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + // array index: ADJ | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0 + }; #endif Scalar* a = reinterpret_cast(pa); @@ -524,22 +559,20 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp // std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); - static functype func[8]; - - static bool init = false; - if(!init) - { - for(int i=0; i<8; ++i) - func[i] = 0; - - func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - - func[NOTR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[ADJ | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - - init = true; - } + static const functype func[8] = { + // array index: NOTR | (UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0, + // array index: ADJ | 
(UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0, + // array index: NOTR | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0, + // array index: ADJ | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* c = reinterpret_cast(pc); -- cgit v1.2.3 From 5b0a9ee0032bbdbf6c8f26e788e9b992c9532432 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:30:24 +0100 Subject: Make sure that block sizes are smaller than input matrix sizes. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 229e96ceb..d2e6f26c8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -252,7 +252,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // we have both L2 and L3, and problem is small enough to be kept in L2 // Let's choose m such that lhs's block fit in 1/3 of L2 actual_lm = l2; - max_mc = 576; + max_mc = (std::min)(576,max_mc); } Index mc = (std::min)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; -- cgit v1.2.3 From aa8c6a251e0c988aa4d8f6914d2052a9d2c9cbc2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:31:48 +0100 Subject: Make sure that micro-panel-size is smaller than blocking sizes (otherwise we might get a buffer overflow) --- Eigen/src/Core/products/TriangularMatrixMatrix.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 39ab87df8..8a2f7cd78 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -126,6 +126,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix(actual_kc-k1, SmallPanelWidth); + Index actualPanelWidth = std::min(actual_kc-k1, panelWidth); Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1; Index startBlock = actual_k2+k1; Index blockBOffset = k1; -- cgit v1.2.3 From 6850eab33b71556f62c29ff29bbcdf0240cfd8e2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:32:48 +0100 Subject: Re-enable blocking on rows in non-l3 blocking mode. --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index d77fc2630..a39c7808c 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -352,9 +352,8 @@ class gemm_blocking_spacem_mc; Index n = this->m_nc; - computeProductBlockingSizes(this->m_kc, m, n, num_threads); + computeProductBlockingSizes(this->m_kc, this->m_mc, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; -- cgit v1.2.3 From cfa21f812339fce7531eb9f27f6d2c5287de661e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:33:15 +0100 Subject: Remove dead code. 
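
The dead code removed here is the per-kernel block-size computation: now that the preceding patches in this series make the caller construct a level3_blocking object and pass it down, the product kernels only read kc/mc and the packing buffers from it. Below is a minimal standalone C++ sketch of that hand-off, for illustration only — it is not Eigen's actual API, and the names Blocking and product_kernel as well as the constants 256/512 are made up:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Caller-owned blocking: block sizes and packing buffers are computed once
    // at the top level, then threaded through every nested product kernel.
    struct Blocking {
      std::ptrdiff_t kc, mc;               // cache block sizes along K and M
      std::vector<double> blockA, blockB;  // pre-sized packing buffers
      Blocking(std::ptrdiff_t rows, std::ptrdiff_t cols, std::ptrdiff_t depth)
        : kc((std::min<std::ptrdiff_t>)(depth, 256)),
          mc((std::min<std::ptrdiff_t>)(rows, 512)),
          blockA(std::size_t(kc * mc)),
          blockB(std::size_t(kc * cols)) {}
    };

    void product_kernel(std::ptrdiff_t rows, Blocking& blocking) {
      // The kernel only reads the sizes; computing them here again is dead code.
      std::ptrdiff_t kc = blocking.kc;
      std::ptrdiff_t mc = (std::min)(rows, blocking.mc);
      (void)kc; (void)mc;  // ... pack into blocking.blockA/blockB, run gebp ...
    }

    int main() {
      Blocking blocking(1000, 1000, 1000);  // built once by the top-level caller
      product_kernel(1000, blocking);       // nested kernels share the buffers
      return 0;
    }

This is the property the nomalloc test enabled earlier in the series exercises: once the caller pre-allocates the buffers, the kernels perform no heap allocation of their own.
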
--- Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 10 ++-------- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 2 -- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index f80f3b410..2c6798d63 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -71,14 +71,8 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc, 1); - - kc = blocking.kc(); - mc = (std::min)(size,blocking.mc()); - nc = (std::min)(size,blocking.nc()); + Index kc = blocking.kc(); + Index mc = (std::min)(size,blocking.mc()); // !!! mc must be a multiple of nr: if(mc > Traits::nr) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index ba8ee1d53..da6f82abc 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -344,10 +344,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Date: Tue, 26 Jan 2016 23:34:48 +0100 Subject: Doc: add flip* and arrayfun MatLab equivalent. --- doc/AsciiQuickReference.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index c604e575c..9599df60b 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -44,7 +44,7 @@ C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi - low,low+step*(size-1)) + low,low+step*(size-1)) // // Matrix slicing and blocks. All expressions listed here are read/write. @@ -94,6 +94,8 @@ R.transpose() // R.' or conj(R') // Read-write R.diagonal() // diag(R) // Read-write x.asDiagonal() // diag(x) R.transpose().colwise().reverse() // rot90(R) // Read-write +R.rowwise().reverse() // fliplr(R) +R.colwise().reverse() // flipud(R) R.replicate(i,j) // repmat(P,i,j) @@ -139,6 +141,7 @@ R.cwiseAbs2() // abs(P.^2) R.array().abs2() // abs(P.^2) (R.array() < s).select(P,Q ); // (R < s ? P : Q) R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0) +R = P.unaryExpr(ptr_fun(func)) // R = arrayfun(func, P) // with: scalar func(const scalar &x); // Reductions. -- cgit v1.2.3 From 412bb5a631a3ff32ff7256e9473f97e40023b2e7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:35:30 +0100 Subject: Remove redundant test. --- test/nomalloc.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index 077ecae55..743ee6aa8 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -81,7 +81,6 @@ template void nomalloc(const MatrixType& m) m2.template selfadjointView().rankUpdate(m1.row(0),-1); // The following fancy matrix-matrix products are not safe yet regarding static allocation -// m1.col(1) += m1.template triangularView() * m2.col(0); m2.template selfadjointView().rankUpdate(m1); m2 += m2.template triangularView() * m1; m2.template triangularView() = m2 * m2; -- cgit v1.2.3 From 02db1228ed9ca3728ae0685a5e1602fe7299ae50 Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Tue, 26 Jan 2016 23:41:01 -0700 Subject: Add constructor for long types. 
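
For context: on some 64-bit platforms (e.g. Mac OS X) int64_t is a typedef for 'long long', so a 'long' argument matches none of TensorUInt128's existing constructors exactly — every candidate needs an integral conversion and the candidates tie, making a call such as TensorUInt128(1L) ambiguous. A self-contained sketch of the problem and of the fix below; U128 is an illustrative stand-in, not the real class:

    #include <cassert>
    #include <cstdint>

    struct U128 {                 // 128-bit value emulated as two 64-bit words
      std::uint64_t high, low;
      U128(int x) : high(0), low(std::uint64_t(x)) { assert(x >= 0); }
      U128(unsigned int x) : high(0), low(x) {}
      U128(long long x) : high(0), low(std::uint64_t(x)) { assert(x >= 0); }
      U128(unsigned long long x) : high(0), low(x) {}
      // The overloads added by this patch: without them, U128(1L) has to pick
      // among the four constructors above via integral conversions of equal
      // rank, which is a compile error.
      U128(long x) : high(0), low(std::uint64_t(x)) { assert(x >= 0); }
      U128(unsigned long x) : high(0), low(x) {}
    };

    int main() { U128 a(1L), b(2UL); (void)a; (void)b; return 0; }

The asserts mirror the class's convention that negative inputs are invalid for an unsigned 128-bit value.
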
--- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 4f2adb671..19352eb5e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -40,6 +40,12 @@ struct TensorUInt128 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(unsigned int x) : high(0), low(x) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(long x) : high(0), low(x) { + eigen_assert(x >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(unsigned long x) : high(0), low(x) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(int64_t x) : high(0), low(x) { eigen_assert(x >= 0); } -- cgit v1.2.3 From fecea26d93a9e1f4178986791943796363391a06 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 15:55:15 +0100 Subject: Extend doc on shifting strategy --- Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index adf3686b4..e45c272b4 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -37,6 +37,8 @@ namespace Eigen { * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then, the factorization is directly performed * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta|) I \f$ where * \f$ \sigma \f$ is the initial shift value as returned and set by setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$. + * If the factorization fails, then the shift is doubled until it succeeds or a maximum of ten attempts is reached. If it still fails, as returned by + * the info() method, then you can either increase the initial shift, or better use another preconditioning technique. * */ template Date: Wed, 27 Jan 2016 17:11:39 +0100 Subject: Add meta_least_common_multiple helper. --- Eigen/src/Core/util/Meta.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 617ba0a65..e3e6d763d 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -326,6 +326,22 @@ class meta_sqrt template class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; }; + +/** \internal Computes the least common multiple of two positive integers A and B + * at compile-time. It implements a naive algorithm testing all multiples of A. + * It thus works better if A>=B. + */ +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; +template +struct meta_least_common_multiple +{ + enum { ret = A*K }; +}; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template struct scalar_product_traits { -- cgit v1.2.3 From 9801c959e685a0341fa35c5843ad16a150018f39 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 17:12:25 +0100 Subject: Fix tri = complex * real product, and add respective unit test.
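
The one-line kernel fix below replaces BlockSize = max(mr,nr) in tribb_kernel with the least common multiple of the two packing granularities, computed by the meta_least_common_multiple helper introduced just above — presumably because with mixed real/complex operands the gebp mr/nr pair can be such that the larger is not a multiple of the smaller. A runnable sketch of the compile-time recursion; meta_lcm is an illustrative stand-in mirroring the helper's A, B, K parameters and its terminating Done flag:

    #include <cstdio>

    // Try A*1, A*2, ... until a multiple of B is found -- a naive search,
    // so it converges fastest when A >= B, as the in-source comment notes.
    template <int A, int B, int K = 1, bool Done = ((A * K) % B == 0)>
    struct meta_lcm {
      enum { ret = meta_lcm<A, B, K + 1>::ret };
    };
    template <int A, int B, int K>
    struct meta_lcm<A, B, K, true> {
      enum { ret = A * K };
    };

    int main() {
      std::printf("%d %d\n", int(meta_lcm<12, 4>::ret),  // 12: max(12,4) already works
                             int(meta_lcm<8, 6>::ret));  // 24: max(8,6)=8 would not
      return 0;
    }

The meta_lcm<8,6> case shows what a max-based block size gets wrong: 8 is not a multiple of 6, whereas the LCM 24 is divisible by both granularities.
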
--- Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 2 +- test/mixingtypes.cpp | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 2c6798d63..831089dee 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -140,7 +140,7 @@ struct tribb_kernel typedef typename Traits::ResScalar ResScalar; enum { - BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) + BlockSize = meta_least_common_multiple::ret }; void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 32d9d0be9..0a583897d 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -44,6 +44,7 @@ template void mixingtypes(int size = SizeAtCompileType) Mat_d md = mf.template cast(); Mat_cf mcf = Mat_cf::Random(size,size); Mat_cd mcd = mcf.template cast >(); + Mat_cd rcd = mcd; Vec_f vf = Vec_f::Random(size,1); Vec_d vd = vf.template cast(); Vec_cf vcf = Vec_cf::Random(size,1); @@ -103,7 +104,6 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(mcd.array() *= md.array(), mcd2.array() *= md.array().template cast >()); // check matrix-matrix products - VERIFY_IS_APPROX(sd*md*mcd, (sd*md).template cast().eval()*mcd); VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast()); VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast().eval()*mcd); @@ -147,6 +147,16 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(scd*vcd.adjoint()*md, scd*vcd.adjoint()*md.template cast().eval()); VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast().eval()*mcd); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast().eval()*mcd); + + rcd.setZero(); + VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = sd * mcd * md), + Mat_cd((sd * mcd * md.template cast().eval()).template triangularView())); + VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = sd * md * mcd), + Mat_cd((sd * md.template cast().eval() * mcd).template triangularView())); + VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = scd * mcd * md), + Mat_cd((scd * mcd * md.template cast().eval()).template triangularView())); + VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = scd * md * mcd), + Mat_cd((scd * md.template cast().eval() * mcd).template triangularView())); } void test_mixingtypes() -- cgit v1.2.3 From 6da5d87f9258c27a1fb142d7cb5ec915497fbeeb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 17:26:48 +0100 Subject: add nomalloc unit test for rank2 updates --- test/nomalloc.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index 743ee6aa8..50756c2fb 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -78,7 +78,8 @@ template void nomalloc(const MatrixType& m) VERIFY_IS_APPROX(m2,m2); m2.template selfadjointView().rankUpdate(m1.col(0),-1); - m2.template selfadjointView().rankUpdate(m1.row(0),-1); + m2.template selfadjointView().rankUpdate(m1.row(0),-1); + m2.template selfadjointView().rankUpdate(m1.col(0), m1.col(0)); // rank-2 // The following fancy matrix-matrix products are not safe yet regarding static allocation m2.template selfadjointView().rankUpdate(m1); -- cgit v1.2.3 From 9ac8e8c6a17c596de3c8afb1a4fc2fc198cb8323 Mon Sep 17 00:00:00 
2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 17:29:53 +0100 Subject: Extend mixing type unit test with trmv, and the following not yet supported products: trmm, symv, symm --- test/mixingtypes.cpp | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 0a583897d..a3b469af8 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -108,19 +108,19 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast()); VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast().eval()*mcd); VERIFY_IS_APPROX(scd*mcd*md, scd*mcd*md.template cast()); - + VERIFY_IS_APPROX(sf*mf*mcf, sf*mf.template cast()*mcf); VERIFY_IS_APPROX(sf*mcf*mf, sf*mcf*mf.template cast()); VERIFY_IS_APPROX(scf*mf*mcf, scf*mf.template cast()*mcf); VERIFY_IS_APPROX(scf*mcf*mf, scf*mcf*mf.template cast()); - + VERIFY_IS_APPROX(sd*md.adjoint()*mcd, (sd*md).template cast().eval().adjoint()*mcd); VERIFY_IS_APPROX(sd*mcd.adjoint()*md, sd*mcd.adjoint()*md.template cast()); VERIFY_IS_APPROX(sd*md.adjoint()*mcd.adjoint(), (sd*md).template cast().eval().adjoint()*mcd.adjoint()); VERIFY_IS_APPROX(sd*mcd.adjoint()*md.adjoint(), sd*mcd.adjoint()*md.template cast().adjoint()); VERIFY_IS_APPROX(sd*md*mcd.adjoint(), (sd*md).template cast().eval()*mcd.adjoint()); VERIFY_IS_APPROX(sd*mcd*md.adjoint(), sd*mcd*md.template cast().adjoint()); - + VERIFY_IS_APPROX(sf*mf.adjoint()*mcf, (sf*mf).template cast().eval().adjoint()*mcf); VERIFY_IS_APPROX(sf*mcf.adjoint()*mf, sf*mcf.adjoint()*mf.template cast()); VERIFY_IS_APPROX(sf*mf.adjoint()*mcf.adjoint(), (sf*mf).template cast().eval().adjoint()*mcf.adjoint()); @@ -148,6 +148,29 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast().eval()*mcd); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast().eval()*mcd); + VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView(), sd*vcd.adjoint()*md.template cast().eval().template triangularView()); + VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView(), scd*vcd.adjoint()*md.template cast().eval().template triangularView()); + VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView(), sd*vd.adjoint().template cast().eval()*mcd.template triangularView()); + VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView(), scd*vd.adjoint().template cast().eval()*mcd.template triangularView()); + + // Not supported yet: trmm +// VERIFY_IS_APPROX(sd*mcd*md.template triangularView(), sd*mcd*md.template cast().eval().template triangularView()); +// VERIFY_IS_APPROX(scd*mcd*md.template triangularView(), scd*mcd*md.template cast().eval().template triangularView()); +// VERIFY_IS_APPROX(sd*md*mcd.template triangularView(), sd*md.template cast().eval()*mcd.template triangularView()); +// VERIFY_IS_APPROX(scd*md*mcd.template triangularView(), scd*md.template cast().eval()*mcd.template triangularView()); + + // Not supported yet: symv +// VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView(), sd*vcd.adjoint()*md.template cast().eval().template selfadjointView()); +// VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView(), scd*vcd.adjoint()*md.template cast().eval().template selfadjointView()); +// VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView(), sd*vd.adjoint().template cast().eval()*mcd.template selfadjointView()); +// VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView(), 
scd*vd.adjoint().template cast().eval()*mcd.template selfadjointView()); + + // Not supported yet: symm +// VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView(), sd*vcd.adjoint()*md.template cast().eval().template selfadjointView()); +// VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView(), scd*vcd.adjoint()*md.template cast().eval().template selfadjointView()); +// VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView(), sd*vd.adjoint().template cast().eval()*mcd.template selfadjointView()); +// VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView(), scd*vd.adjoint().template cast().eval()*mcd.template selfadjointView()); + rcd.setZero(); VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = sd * mcd * md), Mat_cd((sd * mcd * md.template cast().eval()).template triangularView())); -- cgit v1.2.3 From 9aa6fae123053cac30ca55ccaf9f1832d30e4b99 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 18:03:51 +0100 Subject: bug #1154: move to dynamic scheduling for spmv products. --- Eigen/src/SparseCore/SparseDenseProduct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h index 87c946b9b..c9da8a2bb 100644 --- a/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/Eigen/src/SparseCore/SparseDenseProduct.h @@ -48,7 +48,7 @@ struct sparse_time_dense_product_impl1 && lhsEval.nonZerosEstimate() > 20000) { - #pragma omp parallel for schedule(static) num_threads(threads) + #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) for(Index i=0; i Date: Wed, 27 Jan 2016 18:34:42 +0100 Subject: bug #1156: fix several function declarations whose arguments were passed by value instead of being passed by reference --- Eigen/src/Core/MathFunctions.h | 6 +++--- Eigen/src/Geometry/ParametrizedLine.h | 2 +- Eigen/src/Geometry/Translation.h | 2 +- Eigen/src/SparseCore/SparseView.h | 2 +- unsupported/Eigen/AlignedVector3 | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 1c7b28a4b..e47070a46 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1080,21 +1080,21 @@ struct scalar_fuzzy_impl : scalar_fuzzy_default_impl:: template EIGEN_DEVICE_FUNC inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, - typename NumTraits::Real precision = NumTraits::dummy_precision()) + const typename NumTraits::Real &precision = NumTraits::dummy_precision()) { return scalar_fuzzy_impl::template isMuchSmallerThan(x, y, precision); } template EIGEN_DEVICE_FUNC inline bool isApprox(const Scalar& x, const Scalar& y, - typename NumTraits::Real precision = NumTraits::dummy_precision()) + const typename NumTraits::Real &precision = NumTraits::dummy_precision()) { return scalar_fuzzy_impl::isApprox(x, y, precision); } template EIGEN_DEVICE_FUNC inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, - typename NumTraits::Real precision = NumTraits::dummy_precision()) + const typename NumTraits::Real &precision = NumTraits::dummy_precision()) { return scalar_fuzzy_impl::isApproxOrLessThan(x, y, precision); } diff --git a/Eigen/src/Geometry/ParametrizedLine.h b/Eigen/src/Geometry/ParametrizedLine.h index fdcd69760..c43dce773 100644 --- a/Eigen/src/Geometry/ParametrizedLine.h +++ b/Eigen/src/Geometry/ParametrizedLine.h @@ -129,7 +129,7 @@ public: * determined by \a prec. 
* * \sa MatrixBase::isApprox() */ - bool isApprox(const ParametrizedLine& other, typename NumTraits::Real prec = NumTraits::dummy_precision()) const + bool isApprox(const ParametrizedLine& other, const typename NumTraits::Real& prec = NumTraits::dummy_precision()) const { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); } protected: diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h index 87ea445e9..82d7777f0 100644 --- a/Eigen/src/Geometry/Translation.h +++ b/Eigen/src/Geometry/Translation.h @@ -162,7 +162,7 @@ public: * determined by \a prec. * * \sa MatrixBase::isApprox() */ - bool isApprox(const Translation& other, typename NumTraits::Real prec = NumTraits::dummy_precision()) const + bool isApprox(const Translation& other, const typename NumTraits::Real& prec = NumTraits::dummy_precision()) const { return m_coeffs.isApprox(other.m_coeffs, prec); } }; diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h index c945c4dab..b867877d8 100644 --- a/Eigen/src/SparseCore/SparseView.h +++ b/Eigen/src/SparseCore/SparseView.h @@ -38,7 +38,7 @@ public: typedef typename internal::remove_all::type NestedExpression; explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0), - RealScalar epsilon = NumTraits::dummy_precision()) + const RealScalar &epsilon = NumTraits::dummy_precision()) : m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {} inline Index rows() const { return m_matrix.rows(); } diff --git a/unsupported/Eigen/AlignedVector3 b/unsupported/Eigen/AlignedVector3 index f5c40a189..135eec572 100644 --- a/unsupported/Eigen/AlignedVector3 +++ b/unsupported/Eigen/AlignedVector3 @@ -188,7 +188,7 @@ template class AlignedVector3 } template - inline bool isApprox(const MatrixBase& other, RealScalar eps=NumTraits::dummy_precision()) const + inline bool isApprox(const MatrixBase& other, const RealScalar& eps=NumTraits::dummy_precision()) const { return m_coeffs.template head<3>().isApprox(other,eps); } -- cgit v1.2.3 From c8d94ae944407c05ae7600347afb6a532783c962 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 27 Jan 2016 09:52:29 -0800 Subject: digamma special function: merge shared code. Moved type-specific code into a helper class digamma_impl_maybe_poly. --- Eigen/src/Core/SpecialFunctions.h | 217 ++++++++++++++------------------------ 1 file changed, 81 insertions(+), 136 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index bd022946c..21583e6f5 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -134,8 +134,23 @@ struct lgamma_impl { * Implementation of digamma (psi) * ****************************************************************************/ +#ifdef EIGEN_HAS_C99_MATH + +/* + * + * Polynomial evaluation helper for the Psi (digamma) function. + * + * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for + * input Scalar s, assuming s is above 10.0. + * + * If s is above a certain threshold for the given Scalar type, zero + * is returned. Otherwise the polynomial is evaluated with enough + * coefficients for results matching Scalar machine precision. 
+ * + * + */ template -struct digamma_impl { +struct digamma_impl_maybe_poly { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), @@ -144,72 +159,11 @@ struct digamma_impl { } }; -template -struct digamma_retval { - typedef Scalar type; -}; -#ifdef EIGEN_HAS_C99_MATH template <> -struct digamma_impl { - /* - * Psi (digamma) function (modified for Eigen) - * - * - * SYNOPSIS: - * - * float x, y, psif(); - * - * y = psif( x ); - * - * - * DESCRIPTION: - * - * d - - * psi(x) = -- ln | (x) - * dx - * - * is the logarithmic derivative of the gamma function. - * For integer x, - * n-1 - * - - * psi(n) = -EUL + > 1/k. - * - - * k=1 - * - * If x is negative, it is transformed to a positive argument by the - * reflection formula psi(1-x) = psi(x) + pi cot(pi x). - * For general positive x, the argument is made greater than 10 - * using the recurrence psi(x+1) = psi(x) + 1/x. - * Then the following asymptotic expansion is applied: - * - * inf. B - * - 2k - * psi(x) = log(x) - 1/2x - > ------- - * - 2k - * k=1 2k x - * - * where the B2k are Bernoulli numbers. - * - * ACCURACY: - * Absolute error, relative when |psi| > 1 : - * arithmetic domain # trials peak rms - * IEEE -33,0 30000 8.2e-7 1.2e-7 - * IEEE 0,33 100000 7.3e-7 7.7e-8 - * - * ERROR MESSAGES: - * message condition value returned - * psi singularity x integer <=0 INFINITY - */ +struct digamma_impl_maybe_poly { EIGEN_DEVICE_FUNC - static float run(float xx) { - float p, q, nz, x, s, w, y, z; - bool negative; - - // Some necessary constants - const float m_pif = 3.141592653589793238; - const float maxnumf = std::numeric_limits::infinity(); - + static EIGEN_STRONG_INLINE float run(const float s) { const float A[] = { -4.16666666666666666667E-3, 3.96825396825396825397E-3, @@ -217,53 +171,49 @@ struct digamma_impl { 8.33333333333333333333E-2 }; - x = xx; - nz = 0.0f; - negative = 0; - if (x <= 0.0f) { - negative = 1; - q = x; - p = ::floor(q); - if (p == q) { - return (maxnumf); - } - nz = q - p; - if (nz != 0.5f) { - if (nz > 0.5f) { - p += 1.0f; - nz = q - p; - } - nz = m_pif / ::tan(m_pif * nz); - } else { - nz = 0.0f; - } - x = 1.0f - x; - } - - /* use the recurrence psi(x+1) = psi(x) + 1/x. */ - s = x; - w = 0.0f; - while (s < 10.0f) { - w += 1.0f / s; - s += 1.0f; - } - + float z; if (s < 1.0e8f) { z = 1.0f / (s * s); - y = z * cephes::polevl::run(z, A); - } else - y = 0.0f; + return z * cephes::polevl::run(z, A); + } else return 0.0f; + } +}; - y = ::log(s) - (0.5f / s) - y - w; +template <> +struct digamma_impl_maybe_poly { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double s) { + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; - return (negative) ? y - nz : y; + double z; + if (s < 1.0e17) { + z = 1.0 / (s * s); + return z * cephes::polevl::run(z, A); + } + else return 0.0; } }; -template <> -struct digamma_impl { +#endif // EIGEN_HAS_C99_MATH + +template +struct digamma_retval { + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template +struct digamma_impl { EIGEN_DEVICE_FUNC - static double run(double x) { + static Scalar run(Scalar x) { /* * * Psi (digamma) function (modified for Eigen) @@ -304,38 +254,38 @@ struct digamma_impl { * * where the B2k are Bernoulli numbers. 
* - * ACCURACY: + * ACCURACY (float): * Relative error (except absolute when |psi| < 1): * arithmetic domain # trials peak rms * IEEE 0,30 30000 1.3e-15 1.4e-16 * IEEE -30,0 40000 1.5e-15 2.2e-16 * + * ACCURACY (double): + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * * ERROR MESSAGES: * message condition value returned * psi singularity x integer <=0 INFINITY */ - double p, q, nz, s, w, y, z; + Scalar p, q, nz, s, w, y; bool negative; - const double A[] = { - 8.33333333333333333333E-2, - -2.10927960927960927961E-2, - 7.57575757575757575758E-3, - -4.16666666666666666667E-3, - 3.96825396825396825397E-3, - -8.33333333333333333333E-3, - 8.33333333333333333333E-2 - }; - - const double maxnum = std::numeric_limits::infinity(); - const double m_pi = 3.14159265358979323846; + const Scalar maxnum = std::numeric_limits::infinity(); + const Scalar m_pi = 3.14159265358979323846; negative = 0; nz = 0.0; - if (x <= 0.0) { - negative = 1; + const Scalar zero = 0.0; + const Scalar one = 1.0; + const Scalar half = 0.5; + + if (x <= zero) { + negative = one; q = x; p = ::floor(q); if (p == q) { @@ -345,41 +295,36 @@ struct digamma_impl { * by subtracting the nearest integer from x */ nz = q - p; - if (nz != 0.5) { - if (nz > 0.5) { - p += 1.0; + if (nz != half) { + if (nz > half) { + p += one; nz = q - p; } nz = m_pi / ::tan(m_pi * nz); } else { - nz = 0.0; + nz = zero; } - x = 1.0 - x; + x = one - x; } /* use the recurrence psi(x+1) = psi(x) + 1/x. */ s = x; - w = 0.0; - while (s < 10.0) { - w += 1.0 / s; - s += 1.0; + w = zero; + while (s < Scalar(10)) { + w += one / s; + s += one; } - if (s < 1.0e17) { - z = 1.0 / (s * s); - y = z * cephes::polevl::run(z, A); - } - else - y = 0.0; + y = digamma_impl_maybe_poly::run(s); - y = ::log(s) - (0.5 / s) - y - w; + y = ::log(s) - (half / s) - y - w; return (negative) ? 
y - nz : y; } }; -#endif +#endif // EIGEN_HAS_C99_MATH /**************************************************************************** * Implementation of erf * -- cgit v1.2.3 From 5973bcf939be278f81b20d6250405aaaa0791b9d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 12:04:42 -0800 Subject: Properly specify the namespace when calling cout/endl --- unsupported/test/cxx11_tensor_contract_cuda.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cpp index 035a093e6..ac447dd7b 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cpp +++ b/unsupported/test/cxx11_tensor_contract_cuda.cpp @@ -24,7 +24,7 @@ typedef Tensor::DimensionPair DimPair; template static void test_cuda_contraction(int m_size, int k_size, int n_size) { - cout<<"Calling with ("<= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] - << " vs " << t_result_gpu.data()[i] << endl; + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] + << " vs " << t_result_gpu.data()[i] << std::endl; assert(false); } } @@ -83,7 +83,7 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) void test_cxx11_tensor_cuda() { - cout<<"Calling contraction tests"<(128, 128, 128)); CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); for (int k = 32; k < 256; k++) { -- cgit v1.2.3 From 9dfbd4fe8d752f96be55dc0d847aa6df172d3f7e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 12:22:17 -0800 Subject: Made the cuda tests compile using make check --- unsupported/test/CMakeLists.txt | 28 +- unsupported/test/cxx11_tensor_argmax_cuda.cpp | 241 -------- unsupported/test/cxx11_tensor_argmax_cuda.cu | 241 ++++++++ unsupported/test/cxx11_tensor_contract_cuda.cpp | 120 ---- unsupported/test/cxx11_tensor_contract_cuda.cu | 120 ++++ unsupported/test/cxx11_tensor_cuda.cpp | 664 ----------------------- unsupported/test/cxx11_tensor_cuda.cu | 664 +++++++++++++++++++++++ unsupported/test/cxx11_tensor_device.cpp | 388 ------------- unsupported/test/cxx11_tensor_device.cu | 388 +++++++++++++ unsupported/test/cxx11_tensor_random_cuda.cpp | 35 -- unsupported/test/cxx11_tensor_random_cuda.cu | 35 ++ unsupported/test/cxx11_tensor_reduction.cu | 56 ++ unsupported/test/cxx11_tensor_reduction_cuda.cpp | 56 -- 13 files changed, 1524 insertions(+), 1512 deletions(-) delete mode 100644 unsupported/test/cxx11_tensor_argmax_cuda.cpp create mode 100644 unsupported/test/cxx11_tensor_argmax_cuda.cu delete mode 100644 unsupported/test/cxx11_tensor_contract_cuda.cpp create mode 100644 unsupported/test/cxx11_tensor_contract_cuda.cu delete mode 100644 unsupported/test/cxx11_tensor_cuda.cpp create mode 100644 unsupported/test/cxx11_tensor_cuda.cu delete mode 100644 unsupported/test/cxx11_tensor_device.cpp create mode 100644 unsupported/test/cxx11_tensor_device.cu delete mode 100644 unsupported/test/cxx11_tensor_random_cuda.cpp create mode 100644 unsupported/test/cxx11_tensor_random_cuda.cu create mode 100644 unsupported/test/cxx11_tensor_reduction.cu delete mode 100644 unsupported/test/cxx11_tensor_reduction_cuda.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 97257b183..5c383aab6 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -148,12 +148,24 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_fft "-std=c++0x") ei_add_test(cxx11_tensor_ifft "-std=c++0x") - # These tests needs nvcc -# 
ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_random_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_argmax_cuda "-std=c++0x") - endif() + +# These tests needs nvcc +find_package(CUDA 7) +if(CUDA_FOUND) + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) + endif() + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + + ei_add_test(cxx11_tensor_device "-std=c++11") + ei_add_test(cxx11_tensor_cuda "-std=c++11") + ei_add_test(cxx11_tensor_contract_cuda "-std=c++11") + ei_add_test(cxx11_tensor_reduction_cuda "-std=c++11") + ei_add_test(cxx11_tensor_random_cuda "-std=c++11") + ei_add_test(cxx11_tensor_argmax_cuda "-std=c++11 -I/opt-cuda-7.0/include") + + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) +endif(CUDA_FOUND) diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cpp b/unsupported/test/cxx11_tensor_argmax_cuda.cpp deleted file mode 100644 index d37490d15..000000000 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cpp +++ /dev/null @@ -1,241 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// TODO(mdevin): Free the cuda memory. - -#define EIGEN_TEST_FUNC cxx11_tensor_cuda -#define EIGEN_USE_GPU - -#include "main.h" -#include - -using Eigen::Tensor; - -template -void test_cuda_simple_argmax() -{ - Tensor in(Eigen::array(72,53,97)); - Tensor out_max(Eigen::array(1)); - Tensor out_min(Eigen::array(1)); - in.setRandom(); - in *= in.constant(100.0); - in(0, 0, 0) = -1000.0; - in(71, 52, 96) = 1000.0; - - std::size_t in_bytes = in.size() * sizeof(double); - std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); - - double* d_in; - DenseIndex* d_out_max; - DenseIndex* d_out_min; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out_max), out_bytes); - cudaMalloc((void**)(&d_out_min), out_bytes); - - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap, Aligned > gpu_in(d_in, Eigen::array(72,53,97)); - Eigen::TensorMap, Aligned > gpu_out_max(d_out_max, Eigen::array(1)); - Eigen::TensorMap, Aligned > gpu_out_min(d_out_min, Eigen::array(1)); - - gpu_out_max.device(gpu_device) = gpu_in.argmax(); - gpu_out_min.device(gpu_device) = gpu_in.argmin(); - - assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - VERIFY_IS_EQUAL(out_max(Eigen::array(0)), 72*53*97 - 1); - VERIFY_IS_EQUAL(out_min(Eigen::array(0)), 0); -} - -template -void test_cuda_argmax_dim() -{ - Tensor tensor(2,3,5,7); - std::vector dims; - dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7); - - for (int dim = 0; dim < 4; ++dim) { - tensor.setRandom(); - 
tensor = (tensor + tensor.constant(0.5)).log(); - - array out_shape; - for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; - - Tensor tensor_arg(out_shape); - - array ix; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - if (ix[dim] != 0) continue; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix) = 10.0; - } - } - } - } - - std::size_t in_bytes = tensor.size() * sizeof(float); - std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); - - float* d_in; - DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap, Aligned > gpu_in(d_in, Eigen::array(2, 3, 5, 7)); - Eigen::TensorMap, Aligned > gpu_out(d_out, out_shape); - - gpu_out.device(gpu_device) = gpu_in.argmax(dim); - - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - VERIFY_IS_EQUAL(tensor_arg.dimensions().TotalSize(), - size_t(2*3*5*7 / tensor.dimension(dim))); - - for (size_t n = 0; n < tensor_arg.dimensions().TotalSize(); ++n) { - // Expect max to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); - } - - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - if (ix[dim] != tensor.dimension(dim) - 1) continue; - // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix) = 20.0; - } - } - } - } - - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); - - gpu_out.device(gpu_device) = gpu_in.argmax(dim); - - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (size_t n = 0; n < tensor_arg.dimensions().TotalSize(); ++n) { - // Expect max to be in the last index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); - } - } -} - -template -void test_cuda_argmin_dim() -{ - Tensor tensor(2,3,5,7); - std::vector dims; - dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7); - - for (int dim = 0; dim < 4; ++dim) { - tensor.setRandom(); - tensor = (tensor + tensor.constant(0.5)).log(); - - array out_shape; - for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; - - Tensor tensor_arg(out_shape); - - array ix; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - if (ix[dim] != 0) continue; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix) = -10.0; - } - } - } - } - - std::size_t in_bytes = tensor.size() * sizeof(float); - std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); - - float* d_in; - DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap, Aligned > gpu_in(d_in, Eigen::array(2, 3, 5, 7)); - Eigen::TensorMap, Aligned > gpu_out(d_out, out_shape); - - gpu_out.device(gpu_device) = gpu_in.argmin(dim); - - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - VERIFY_IS_EQUAL(tensor_arg.dimensions().TotalSize(), - size_t(2*3*5*7 / tensor.dimension(dim))); - - for (size_t n = 0; n < tensor_arg.dimensions().TotalSize(); ++n) { - // Expect min to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); - } - - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - if (ix[dim] != tensor.dimension(dim) - 1) continue; - // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix) = -20.0; - } - } - } - } - - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); - - gpu_out.device(gpu_device) = gpu_in.argmin(dim); - - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (size_t n = 0; n < tensor_arg.dimensions().TotalSize(); ++n) { - // Expect max to be in the last index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); - } - } -} - -void test_cxx11_tensor_cuda() -{ - CALL_SUBTEST(test_cuda_simple_argmax()); - CALL_SUBTEST(test_cuda_simple_argmax()); - CALL_SUBTEST(test_cuda_argmax_dim()); - CALL_SUBTEST(test_cuda_argmax_dim()); - CALL_SUBTEST(test_cuda_argmin_dim()); - CALL_SUBTEST(test_cuda_argmin_dim()); -} diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu new file mode 100644 index 000000000..d37490d15 --- /dev/null +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -0,0 +1,241 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// TODO(mdevin): Free the cuda memory. 
+ +#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_USE_GPU + +#include "main.h" +#include + +using Eigen::Tensor; + +template +void test_cuda_simple_argmax() +{ + Tensor in(Eigen::array(72,53,97)); + Tensor out_max(Eigen::array(1)); + Tensor out_min(Eigen::array(1)); + in.setRandom(); + in *= in.constant(100.0); + in(0, 0, 0) = -1000.0; + in(71, 52, 96) = 1000.0; + + std::size_t in_bytes = in.size() * sizeof(double); + std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); + + double* d_in; + DenseIndex* d_out_max; + DenseIndex* d_out_min; + cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_out_max), out_bytes); + cudaMalloc((void**)(&d_out_min), out_bytes); + + cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap, Aligned > gpu_in(d_in, Eigen::array(72,53,97)); + Eigen::TensorMap, Aligned > gpu_out_max(d_out_max, Eigen::array(1)); + Eigen::TensorMap, Aligned > gpu_out_min(d_out_min, Eigen::array(1)); + + gpu_out_max.device(gpu_device) = gpu_in.argmax(); + gpu_out_min.device(gpu_device) = gpu_in.argmin(); + + assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + VERIFY_IS_EQUAL(out_max(Eigen::array(0)), 72*53*97 - 1); + VERIFY_IS_EQUAL(out_min(Eigen::array(0)), 0); +} + +template +void test_cuda_argmax_dim() +{ + Tensor tensor(2,3,5,7); + std::vector dims; + dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7); + + for (int dim = 0; dim < 4; ++dim) { + tensor.setRandom(); + tensor = (tensor + tensor.constant(0.5)).log(); + + array out_shape; + for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; + + Tensor tensor_arg(out_shape); + + array ix; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + if (ix[dim] != 0) continue; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix) = 10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(float); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + float* d_in; + DenseIndex* d_out; + cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap, Aligned > gpu_in(d_in, Eigen::array(2, 3, 5, 7)); + Eigen::TensorMap, Aligned > gpu_out(d_out, out_shape); + + gpu_out.device(gpu_device) = gpu_in.argmax(dim); + + assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + VERIFY_IS_EQUAL(tensor_arg.dimensions().TotalSize(), + size_t(2*3*5*7 / tensor.dimension(dim))); + + for (size_t n = 0; n < tensor_arg.dimensions().TotalSize(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + if (ix[dim] != tensor.dimension(dim) - 1) continue; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix) = 20.0; + } + } + } + } + + cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + + gpu_out.device(gpu_device) = gpu_in.argmax(dim); + + assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (size_t n = 0; n < tensor_arg.dimensions().TotalSize(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + } +} + +template +void test_cuda_argmin_dim() +{ + Tensor tensor(2,3,5,7); + std::vector dims; + dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7); + + for (int dim = 0; dim < 4; ++dim) { + tensor.setRandom(); + tensor = (tensor + tensor.constant(0.5)).log(); + + array out_shape; + for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; + + Tensor tensor_arg(out_shape); + + array ix; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + if (ix[dim] != 0) continue; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix) = -10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(float); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + float* d_in; + DenseIndex* d_out; + cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap, Aligned > gpu_in(d_in, Eigen::array(2, 3, 5, 7)); + Eigen::TensorMap, Aligned > gpu_out(d_out, out_shape); + + gpu_out.device(gpu_device) = gpu_in.argmin(dim); + + assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + VERIFY_IS_EQUAL(tensor_arg.dimensions().TotalSize(), + size_t(2*3*5*7 / tensor.dimension(dim))); + + for (size_t n = 0; n < tensor_arg.dimensions().TotalSize(); ++n) { + // Expect min to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + if (ix[dim] != tensor.dimension(dim) - 1) continue; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix) = -20.0; + } + } + } + } + + cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + + gpu_out.device(gpu_device) = gpu_in.argmin(dim); + + assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (size_t n = 0; n < tensor_arg.dimensions().TotalSize(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + } +} + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST(test_cuda_simple_argmax()); + CALL_SUBTEST(test_cuda_simple_argmax()); + CALL_SUBTEST(test_cuda_argmax_dim()); + CALL_SUBTEST(test_cuda_argmax_dim()); + CALL_SUBTEST(test_cuda_argmin_dim()); + CALL_SUBTEST(test_cuda_argmin_dim()); +} diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cpp deleted file mode 100644 index ac447dd7b..000000000 --- a/unsupported/test/cxx11_tensor_contract_cuda.cpp +++ /dev/null @@ -1,120 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2014 Navdeep Jaitly -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cuda -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - - -#include "main.h" -#include - -using Eigen::Tensor; -typedef Tensor::DimensionPair DimPair; - -template -static void test_cuda_contraction(int m_size, int k_size, int n_size) -{ - std::cout<<"Calling with ("< t_left(Eigen::array(m_size, k_size)); - Tensor t_right(Eigen::array(k_size, n_size)); - Tensor t_result(Eigen::array(m_size, n_size)); - Tensor t_result_gpu(Eigen::array(m_size, n_size)); - Eigen::array dims(DimPair(1, 0)); - - t_left.setRandom(); - t_right.setRandom(); - - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size() * sizeof(float); - - float* d_t_left; - float* d_t_right; - float* d_t_result; - - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); - - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > - gpu_t_left(d_t_left, Eigen::array(m_size, k_size)); - Eigen::TensorMap > - gpu_t_right(d_t_right, Eigen::array(k_size, n_size)); - Eigen::TensorMap > - gpu_t_result(d_t_result, Eigen::array(m_size, n_size)); - - - gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); - t_result = t_left.contract(t_right, dims); - - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); - for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) { - std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] - << " vs " << t_result_gpu.data()[i] << std::endl; - assert(false); - } - } - - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); -} - - -void test_cxx11_tensor_cuda() -{ - std::cout<<"Calling contraction tests"<(128, 128, 128)); - CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); - for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(128, k, 128)); - CALL_SUBTEST(test_cuda_contraction(128, k, 128)); - } - for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(128, 128, k)); - CALL_SUBTEST(test_cuda_contraction(128, 128, k)); - } - for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(k, 128, 128)); - CALL_SUBTEST(test_cuda_contraction(k, 128, 128)); - } - - int m_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - int n_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - - for (int i = 0; i <15; i++) - for (int j = 0; j < 15; j++) - for (int k = 0; k < 17; k++) { - CALL_SUBTEST(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); - CALL_SUBTEST(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); - } -} diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu new file mode 100644 index 000000000..ac447dd7b --- /dev/null +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -0,0 +1,120 @@ +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2014 Navdeep Jaitly +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include + +using Eigen::Tensor; +typedef Tensor::DimensionPair DimPair; + +template +static void test_cuda_contraction(int m_size, int k_size, int n_size) +{ + std::cout<<"Calling with ("< t_left(Eigen::array(m_size, k_size)); + Tensor t_right(Eigen::array(k_size, n_size)); + Tensor t_result(Eigen::array(m_size, n_size)); + Tensor t_result_gpu(Eigen::array(m_size, n_size)); + Eigen::array dims(DimPair(1, 0)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > + gpu_t_left(d_t_left, Eigen::array(m_size, k_size)); + Eigen::TensorMap > + gpu_t_right(d_t_right, Eigen::array(k_size, n_size)); + Eigen::TensorMap > + gpu_t_result(d_t_result, Eigen::array(m_size, n_size)); + + + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + t_result = t_left.contract(t_right, dims); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] + << " vs " << t_result_gpu.data()[i] << std::endl; + assert(false); + } + } + + cudaFree((void*)d_t_left); + cudaFree((void*)d_t_right); + cudaFree((void*)d_t_result); +} + + +void test_cxx11_tensor_cuda() +{ + std::cout<<"Calling contraction tests"<(128, 128, 128)); + CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction(128, k, 128)); + CALL_SUBTEST(test_cuda_contraction(128, k, 128)); + } + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction(128, 128, k)); + CALL_SUBTEST(test_cuda_contraction(128, 128, k)); + } + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction(k, 128, 128)); + CALL_SUBTEST(test_cuda_contraction(k, 128, 128)); + } + + int m_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025 }; + int n_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025 }; + + int k_sizes[] = { 31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; + + for (int i = 0; i <15; i++) + for (int j = 0; j < 15; j++) + for (int k = 0; k < 17; k++) { + CALL_SUBTEST(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); + 
CALL_SUBTEST(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); + } +} diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp deleted file mode 100644 index 49e1894ab..000000000 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ /dev/null @@ -1,664 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// TODO(mdevin): Free the cuda memory. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cuda -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - - -#include "main.h" -#include - -using Eigen::Tensor; - -void test_cuda_elementwise_small() { - Tensor in1(Eigen::array(2)); - Tensor in2(Eigen::array(2)); - Tensor out(Eigen::array(2)); - in1.setRandom(); - in2.setRandom(); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t in2_bytes = in2.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_in1; - float* d_in2; - float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap, Eigen::Aligned> gpu_in1( - d_in1, Eigen::array(2)); - Eigen::TensorMap, Eigen::Aligned> gpu_in2( - d_in2, Eigen::array(2)); - Eigen::TensorMap, Eigen::Aligned> gpu_out( - d_out, Eigen::array(2)); - - gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; - - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 2; ++i) { - VERIFY_IS_APPROX( - out(Eigen::array(i)), - in1(Eigen::array(i)) + in2(Eigen::array(i))); - } -} - -void test_cuda_elementwise() -{ - Tensor in1(Eigen::array(72,53,97)); - Tensor in2(Eigen::array(72,53,97)); - Tensor in3(Eigen::array(72,53,97)); - Tensor out(Eigen::array(72,53,97)); - in1.setRandom(); - in2.setRandom(); - in3.setRandom(); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t in2_bytes = in2.size() * sizeof(float); - std::size_t in3_bytes = in3.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_in1; - float* d_in2; - float* d_in3; - float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_in3), in3_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(72,53,97)); - Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(72,53,97)); - Eigen::TensorMap > gpu_in3(d_in3, Eigen::array(72,53,97)); - Eigen::TensorMap > gpu_out(d_out, Eigen::array(72,53,97)); - - gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; - - 
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 53; ++j) { - for (int k = 0; k < 97; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * in3(Eigen::array(i,j,k))); - } - } - } -} - -void test_cuda_reduction() -{ - Tensor in1(72,53,97,113); - Tensor out(72,97); - in1.setRandom(); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_in1; - float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_in1(d_in1, 72,53,97,113); - Eigen::TensorMap > gpu_out(d_out, 72,97); - - array reduction_axis; - reduction_axis[0] = 1; - reduction_axis[1] = 3; - - gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); - - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 97; ++j) { - float expected = 0; - for (int k = 0; k < 53; ++k) { - for (int l = 0; l < 113; ++l) { - expected = - std::max(expected, in1(i, k, j, l)); - } - } - VERIFY_IS_APPROX(out(i,j), expected); - } - } -} - -template -static void test_cuda_contraction() -{ - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU - Tensor t_left(6, 50, 3, 31); - Tensor t_right(Eigen::array(3, 31, 7, 20, 1)); - Tensor t_result(Eigen::array(6, 50, 7, 20, 1)); - - t_left.setRandom(); - t_right.setRandom(); - - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size() * sizeof(float); - - float* d_t_left; - float* d_t_right; - float* d_t_result; - - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); - - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); - - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_t_left(d_t_left, 6, 50, 3, 31); - Eigen::TensorMap > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); - Eigen::TensorMap > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); - - typedef Eigen::Map > MapXf; - MapXf m_left(t_left.data(), 300, 93); - MapXf m_right(t_right.data(), 93, 140); - Eigen::Matrix m_result(300, 140); - - typedef Tensor::DimensionPair DimPair; - Eigen::array dims; - dims[0] = DimPair(2, 0); - dims[1] = DimPair(3, 1); - - m_result = m_left * m_right; - gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); - - cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); - - for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << 
t_result.data()[i] << " vs " << m_result.data()[i] << endl; - assert(false); - } - } -} - -template -static void test_cuda_convolution_1d() -{ - Tensor input(74,37,11,137); - Tensor kernel(4); - Tensor out(74,34,11,137); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_input(d_input, 74,37,11,137); - Eigen::TensorMap > gpu_kernel(d_kernel, 4); - Eigen::TensorMap > gpu_out(d_out, 74,34,11,137); - - Eigen::array dims(1); - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 34; ++j) { - for (int k = 0; k < 11; ++k) { - for (int l = 0; l < 137; ++l) { - const float result = out(i,j,k,l); - const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + - input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); - VERIFY_IS_APPROX(result, expected); - } - } - } - } -} - -static void test_cuda_convolution_inner_dim_col_major_1d() -{ - Tensor input(74,9,11,7); - Tensor kernel(4); - Tensor out(71,9,11,7); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_input(d_input,74,9,11,7); - Eigen::TensorMap > gpu_kernel(d_kernel,4); - Eigen::TensorMap > gpu_out(d_out,71,9,11,7); - - Eigen::array dims(0); - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 71; ++i) { - for (int j = 0; j < 9; ++j) { - for (int k = 0; k < 11; ++k) { - for (int l = 0; l < 7; ++l) { - const float result = out(i,j,k,l); - const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + - input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); - VERIFY_IS_APPROX(result, expected); - } - } - } - } -} - -static void test_cuda_convolution_inner_dim_row_major_1d() -{ - Tensor input(7,9,11,74); - 
Tensor kernel(4); - Tensor out(7,9,11,71); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_input(d_input, 7,9,11,74); - Eigen::TensorMap > gpu_kernel(d_kernel, 4); - Eigen::TensorMap > gpu_out(d_out, 7,9,11,71); - - Eigen::array dims(3); - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 7; ++i) { - for (int j = 0; j < 9; ++j) { - for (int k = 0; k < 11; ++k) { - for (int l = 0; l < 71; ++l) { - const float result = out(i,j,k,l); - const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + - input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); - VERIFY_IS_APPROX(result, expected); - } - } - } - } -} - -template -static void test_cuda_convolution_2d() -{ - Tensor input(74,37,11,137); - Tensor kernel(3,4); - Tensor out(74,35,8,137); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_input(d_input,74,37,11,137); - Eigen::TensorMap > gpu_kernel(d_kernel,3,4); - Eigen::TensorMap > gpu_out(d_out,74,35,8,137); - - Eigen::array dims(1,2); - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 137; ++l) { - const float result = out(i,j,k,l); - const float expected = input(i,j+0,k+0,l) * kernel(0,0) + - input(i,j+1,k+0,l) * kernel(1,0) + - input(i,j+2,k+0,l) * kernel(2,0) + - input(i,j+0,k+1,l) * kernel(0,1) + - input(i,j+1,k+1,l) * kernel(1,1) + - input(i,j+2,k+1,l) * kernel(2,1) + - input(i,j+0,k+2,l) * kernel(0,2) + - input(i,j+1,k+2,l) * kernel(1,2) + - input(i,j+2,k+2,l) * kernel(2,2) + - input(i,j+0,k+3,l) * kernel(0,3) + - input(i,j+1,k+3,l) * kernel(1,3) + - input(i,j+2,k+3,l) * kernel(2,3); - 
VERIFY_IS_APPROX(result, expected); - } - } - } - } -} - -template -static void test_cuda_convolution_3d() -{ - Tensor input(Eigen::array(74,37,11,137,17)); - Tensor kernel(3,4,2); - Tensor out(Eigen::array(74,35,8,136,17)); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_input(d_input,74,37,11,137,17); - Eigen::TensorMap > gpu_kernel(d_kernel,3,4,2); - Eigen::TensorMap > gpu_out(d_out,74,35,8,136,17); - - Eigen::array dims(1,2,3); - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 136; ++l) { - for (int m = 0; m < 17; ++m) { - const float result = out(i,j,k,l,m); - const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + - input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + - input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + - input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + - input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + - input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + - input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + - input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + - input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + - input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + - input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + - input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + - input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + - input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + - input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + - input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + - input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + - input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + - input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + - input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + - input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + - input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + - input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + - input(i,j+2,k+3,l+1,m) * kernel(2,3,1); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - } -} - - -template -void test_cuda_lgamma(const Scalar stddev) -{ - Tensor in(72,97); - in.setRandom(); - in *= in.constant(stddev); - Tensor out(72,97); - out.setZero(); - - std::size_t bytes = in.size() * sizeof(Scalar); - - Scalar* d_in; - Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); - - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_in(d_in, 72, 97); - Eigen::TensorMap > gpu_out(d_out, 72, 97); - - gpu_out.device(gpu_device) = gpu_in.lgamma(); - - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 72; 
++i) { - for (int j = 0; j < 97; ++j) { - VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); - } - } -} - -template -void test_cuda_erf(const Scalar stddev) -{ - Tensor in(72,97); - in.setRandom(); - in *= in.constant(stddev); - Tensor out(72,97); - out.setZero(); - - std::size_t bytes = in.size() * sizeof(Scalar); - - Scalar* d_in; - Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); - - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_in(d_in, 72, 97); - Eigen::TensorMap > gpu_out(d_out, 72, 97); - - gpu_out.device(gpu_device) = gpu_in.erf(); - - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 97; ++j) { - VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); - } - } -} - -template -void test_cuda_erfc(const Scalar stddev) -{ - Tensor in(72,97); - in.setRandom(); - in *= in.constant(stddev); - Tensor out(72,97); - out.setZero(); - - std::size_t bytes = in.size() * sizeof(Scalar); - - Scalar* d_in; - Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); - - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap > gpu_in(d_in, 72, 97); - Eigen::TensorMap > gpu_out(d_out, 72, 97); - - gpu_out.device(gpu_device) = gpu_in.erfc(); - - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 97; ++j) { - VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); - } - } -} - -void test_cxx11_tensor_cuda() -{ - CALL_SUBTEST(test_cuda_elementwise_small()); - CALL_SUBTEST(test_cuda_elementwise()); - CALL_SUBTEST(test_cuda_reduction()); - CALL_SUBTEST(test_cuda_contraction()); - CALL_SUBTEST(test_cuda_contraction()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_lgamma(1.0f)); - CALL_SUBTEST(test_cuda_lgamma(100.0f)); - CALL_SUBTEST(test_cuda_lgamma(0.01f)); - CALL_SUBTEST(test_cuda_lgamma(0.001f)); - CALL_SUBTEST(test_cuda_erf(1.0f)); - CALL_SUBTEST(test_cuda_erf(100.0f)); - CALL_SUBTEST(test_cuda_erf(0.01f)); - CALL_SUBTEST(test_cuda_erf(0.001f)); - CALL_SUBTEST(test_cuda_erfc(1.0f)); - // CALL_SUBTEST(test_cuda_erfc(100.0f)); - CALL_SUBTEST(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST(test_cuda_erfc(0.01f)); - CALL_SUBTEST(test_cuda_erfc(0.001f)); - CALL_SUBTEST(test_cuda_tanh(1.0)); - CALL_SUBTEST(test_cuda_tanh(100.0)); - CALL_SUBTEST(test_cuda_tanh(0.01)); - CALL_SUBTEST(test_cuda_tanh(0.001)); - CALL_SUBTEST(test_cuda_lgamma(1.0)); - CALL_SUBTEST(test_cuda_lgamma(100.0)); - CALL_SUBTEST(test_cuda_lgamma(0.01)); - CALL_SUBTEST(test_cuda_lgamma(0.001)); - CALL_SUBTEST(test_cuda_erf(1.0)); - CALL_SUBTEST(test_cuda_erf(100.0)); - 
CALL_SUBTEST(test_cuda_erf(0.01)); - CALL_SUBTEST(test_cuda_erf(0.001)); - CALL_SUBTEST(test_cuda_erfc(1.0)); - // CALL_SUBTEST(test_cuda_erfc(100.0)); - CALL_SUBTEST(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST(test_cuda_erfc(0.01)); - CALL_SUBTEST(test_cuda_erfc(0.001)); -} diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu new file mode 100644 index 000000000..49e1894ab --- /dev/null +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -0,0 +1,664 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// TODO(mdevin): Free the cuda memory. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include + +using Eigen::Tensor; + +void test_cuda_elementwise_small() { + Tensor in1(Eigen::array(2)); + Tensor in2(Eigen::array(2)); + Tensor out(Eigen::array(2)); + in1.setRandom(); + in2.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap, Eigen::Aligned> gpu_in1( + d_in1, Eigen::array(2)); + Eigen::TensorMap, Eigen::Aligned> gpu_in2( + d_in2, Eigen::array(2)); + Eigen::TensorMap, Eigen::Aligned> gpu_out( + d_out, Eigen::array(2)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX( + out(Eigen::array(i)), + in1(Eigen::array(i)) + in2(Eigen::array(i))); + } +} + +void test_cuda_elementwise() +{ + Tensor in1(Eigen::array(72,53,97)); + Tensor in2(Eigen::array(72,53,97)); + Tensor in3(Eigen::array(72,53,97)); + Tensor out(Eigen::array(72,53,97)); + in1.setRandom(); + in2.setRandom(); + in3.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t in3_bytes = in3.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_in3; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_in3), in3_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_in2(d_in2, 
Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_in3(d_in3, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(72,53,97)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * in3(Eigen::array(i,j,k))); + } + } + } +} + +void test_cuda_reduction() +{ + Tensor in1(72,53,97,113); + Tensor out(72,97); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in1(d_in1, 72,53,97,113); + Eigen::TensorMap > gpu_out(d_out, 72,97); + + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + float expected = 0; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 113; ++l) { + expected = + std::max(expected, in1(i, k, j, l)); + } + } + VERIFY_IS_APPROX(out(i,j), expected); + } + } +} + +template +static void test_cuda_contraction() +{ + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor t_left(6, 50, 3, 31); + Tensor t_right(Eigen::array(3, 31, 7, 20, 1)); + Tensor t_result(Eigen::array(6, 50, 7, 20, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_t_left(d_t_left, 6, 50, 3, 31); + Eigen::TensorMap > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); + Eigen::TensorMap > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); + + typedef Eigen::Map > MapXf; + MapXf m_left(t_left.data(), 300, 93); + MapXf m_right(t_right.data(), 93, 140); + Eigen::Matrix m_result(300, 140); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); + + m_result = m_left * m_right; + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + + cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, 
cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + assert(false); + } + } +} + +template +static void test_cuda_convolution_1d() +{ + Tensor input(74,37,11,137); + Tensor kernel(4); + Tensor out(74,34,11,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input, 74,37,11,137); + Eigen::TensorMap > gpu_kernel(d_kernel, 4); + Eigen::TensorMap > gpu_out(d_out, 74,34,11,137); + + Eigen::array dims(1); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 34; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + + input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + +static void test_cuda_convolution_inner_dim_col_major_1d() +{ + Tensor input(74,9,11,7); + Tensor kernel(4); + Tensor out(71,9,11,7); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input,74,9,11,7); + Eigen::TensorMap > gpu_kernel(d_kernel,4); + Eigen::TensorMap > gpu_out(d_out,71,9,11,7); + + Eigen::array dims(0); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 71; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 7; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + + 
input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + +static void test_cuda_convolution_inner_dim_row_major_1d() +{ + Tensor input(7,9,11,74); + Tensor kernel(4); + Tensor out(7,9,11,71); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input, 7,9,11,74); + Eigen::TensorMap > gpu_kernel(d_kernel, 4); + Eigen::TensorMap > gpu_out(d_out, 7,9,11,71); + + Eigen::array dims(3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 71; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + + input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + +template +static void test_cuda_convolution_2d() +{ + Tensor input(74,37,11,137); + Tensor kernel(3,4); + Tensor out(74,35,8,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input,74,37,11,137); + Eigen::TensorMap > gpu_kernel(d_kernel,3,4); + Eigen::TensorMap > gpu_out(d_out,74,35,8,137); + + Eigen::array dims(1,2); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+2,k+0,l) * kernel(2,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1) + + input(i,j+2,k+1,l) * kernel(2,1) + + input(i,j+0,k+2,l) * 
kernel(0,2) + + input(i,j+1,k+2,l) * kernel(1,2) + + input(i,j+2,k+2,l) * kernel(2,2) + + input(i,j+0,k+3,l) * kernel(0,3) + + input(i,j+1,k+3,l) * kernel(1,3) + + input(i,j+2,k+3,l) * kernel(2,3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + +template +static void test_cuda_convolution_3d() +{ + Tensor input(Eigen::array(74,37,11,137,17)); + Tensor kernel(3,4,2); + Tensor out(Eigen::array(74,35,8,136,17)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input,74,37,11,137,17); + Eigen::TensorMap > gpu_kernel(d_kernel,3,4,2); + Eigen::TensorMap > gpu_out(d_out,74,35,8,136,17); + + Eigen::array dims(1,2,3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 17; ++m) { + const float result = out(i,j,k,l,m); + const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + + input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + + input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + + input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + + input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + + input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + + input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + + input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + + input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + + input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + + input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + + input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + + input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + + input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + + input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + + input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + + input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + + input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + + input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + + input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + + input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + + input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + + input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + + input(i,j+2,k+3,l+1,m) * kernel(2,3,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + } +} + + +template +void test_cuda_lgamma(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.lgamma(); + + 
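  // (The assignment through .device(gpu_device) above is what evaluates the
  // expression on the GPU; the TensorMap objects only alias the
  // cudaMalloc'ed buffers. The async copy below is issued on the same
  // stream, so the cudaStreamSynchronize that follows is required before
  // the host loop can safely read out().)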
assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); + } + } +} + +template +void test_cuda_erf(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erf(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); + } + } +} + +template +void test_cuda_erfc(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erfc(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); + } + } +} + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST(test_cuda_elementwise_small()); + CALL_SUBTEST(test_cuda_elementwise()); + CALL_SUBTEST(test_cuda_reduction()); + CALL_SUBTEST(test_cuda_contraction()); + CALL_SUBTEST(test_cuda_contraction()); + CALL_SUBTEST(test_cuda_convolution_1d()); + CALL_SUBTEST(test_cuda_convolution_1d()); + CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST(test_cuda_convolution_2d()); + CALL_SUBTEST(test_cuda_convolution_2d()); + CALL_SUBTEST(test_cuda_convolution_3d()); + CALL_SUBTEST(test_cuda_convolution_3d()); + CALL_SUBTEST(test_cuda_lgamma(1.0f)); + CALL_SUBTEST(test_cuda_lgamma(100.0f)); + CALL_SUBTEST(test_cuda_lgamma(0.01f)); + CALL_SUBTEST(test_cuda_lgamma(0.001f)); + CALL_SUBTEST(test_cuda_erf(1.0f)); + CALL_SUBTEST(test_cuda_erf(100.0f)); + CALL_SUBTEST(test_cuda_erf(0.01f)); + CALL_SUBTEST(test_cuda_erf(0.001f)); + CALL_SUBTEST(test_cuda_erfc(1.0f)); + // CALL_SUBTEST(test_cuda_erfc(100.0f)); + CALL_SUBTEST(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST(test_cuda_erfc(0.01f)); + CALL_SUBTEST(test_cuda_erfc(0.001f)); + CALL_SUBTEST(test_cuda_tanh(1.0)); + CALL_SUBTEST(test_cuda_tanh(100.0)); + CALL_SUBTEST(test_cuda_tanh(0.01)); + CALL_SUBTEST(test_cuda_tanh(0.001)); + CALL_SUBTEST(test_cuda_lgamma(1.0)); + 
CALL_SUBTEST(test_cuda_lgamma(100.0)); + CALL_SUBTEST(test_cuda_lgamma(0.01)); + CALL_SUBTEST(test_cuda_lgamma(0.001)); + CALL_SUBTEST(test_cuda_erf(1.0)); + CALL_SUBTEST(test_cuda_erf(100.0)); + CALL_SUBTEST(test_cuda_erf(0.01)); + CALL_SUBTEST(test_cuda_erf(0.001)); + CALL_SUBTEST(test_cuda_erfc(1.0)); + // CALL_SUBTEST(test_cuda_erfc(100.0)); + CALL_SUBTEST(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST(test_cuda_erfc(0.01)); + CALL_SUBTEST(test_cuda_erfc(0.001)); +} diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp deleted file mode 100644 index ed5dd7505..000000000 --- a/unsupported/test/cxx11_tensor_device.cpp +++ /dev/null @@ -1,388 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_device -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - - -#include "main.h" -#include - -using Eigen::Tensor; -using Eigen::RowMajor; - -// Context for evaluation on cpu -struct CPUContext { - CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { - kernel_1d_(0) = 3.14f; - kernel_1d_(1) = 2.7f; - - kernel_2d_(0,0) = 3.14f; - kernel_2d_(1,0) = 2.7f; - kernel_2d_(0,1) = 0.2f; - kernel_2d_(1,1) = 7.0f; - - kernel_3d_(0,0,0) = 3.14f; - kernel_3d_(0,1,0) = 2.7f; - kernel_3d_(0,0,1) = 0.2f; - kernel_3d_(0,1,1) = 7.0f; - kernel_3d_(1,0,0) = -1.0f; - kernel_3d_(1,1,0) = -0.3f; - kernel_3d_(1,0,1) = -0.7f; - kernel_3d_(1,1,1) = -0.5f; - } - - const Eigen::DefaultDevice& device() const { return cpu_device_; } - - const Eigen::Tensor& in1() const { return in1_; } - const Eigen::Tensor& in2() const { return in2_; } - Eigen::Tensor& out() { return out_; } - const Eigen::Tensor& kernel1d() const { return kernel_1d_; } - const Eigen::Tensor& kernel2d() const { return kernel_2d_; } - const Eigen::Tensor& kernel3d() const { return kernel_3d_; } - - private: - const Eigen::Tensor& in1_; - const Eigen::Tensor& in2_; - Eigen::Tensor& out_; - - Eigen::Tensor kernel_1d_; - Eigen::Tensor kernel_2d_; - Eigen::Tensor kernel_3d_; - - Eigen::DefaultDevice cpu_device_; -}; - - -// Context for evaluation on GPU -struct GPUContext { - GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { - assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); - float kernel_1d_val[] = {3.14f, 2.7f}; - assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); - - assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); - float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; - assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); - - assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); - float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; - assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); - } - ~GPUContext() { - 
assert(cudaFree(kernel_1d_) == cudaSuccess); - assert(cudaFree(kernel_2d_) == cudaSuccess); - assert(cudaFree(kernel_3d_) == cudaSuccess); - } - - const Eigen::GpuDevice& device() const { return gpu_device_; } - - const Eigen::TensorMap >& in1() const { return in1_; } - const Eigen::TensorMap >& in2() const { return in2_; } - Eigen::TensorMap >& out() { return out_; } - Eigen::TensorMap > kernel1d() const { return Eigen::TensorMap >(kernel_1d_, 2); } - Eigen::TensorMap > kernel2d() const { return Eigen::TensorMap >(kernel_2d_, 2, 2); } - Eigen::TensorMap > kernel3d() const { return Eigen::TensorMap >(kernel_3d_, 2, 2, 2); } - - private: - const Eigen::TensorMap >& in1_; - const Eigen::TensorMap >& in2_; - Eigen::TensorMap >& out_; - - float* kernel_1d_; - float* kernel_2d_; - float* kernel_3d_; - - Eigen::CudaStreamDevice stream_; - Eigen::GpuDevice gpu_device_; -}; - - -// The actual expression to evaluate -template -static void test_contextual_eval(Context* context) -{ - context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); -} - -template -static void test_forced_contextual_eval(Context* context) -{ - context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); -} - -template -static void test_compound_assignment(Context* context) -{ - context->out().device(context->device()) = context->in1().constant(2.718f); - context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; -} - - -template -static void test_contraction(Context* context) -{ - Eigen::array, 2> dims; - dims[0] = std::make_pair(1, 1); - dims[1] = std::make_pair(2, 2); - - Eigen::array shape(40, 50*70); - - Eigen::DSizes indices(0,0); - Eigen::DSizes sizes(40,40); - - context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); -} - - -template -static void test_1d_convolution(Context* context) -{ - Eigen::DSizes indices(0,0,0); - Eigen::DSizes sizes(40,49,70); - - Eigen::array dims(1); - context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); -} - -template -static void test_2d_convolution(Context* context) -{ - Eigen::DSizes indices(0,0,0); - Eigen::DSizes sizes(40,49,69); - - Eigen::array dims(1,2); - context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); -} - -template -static void test_3d_convolution(Context* context) -{ - Eigen::DSizes indices(0,0,0); - Eigen::DSizes sizes(39,49,69); - - Eigen::array dims(0,1,2); - context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); -} - - -static void test_cpu() { - Eigen::Tensor in1(40,50,70); - Eigen::Tensor in2(40,50,70); - Eigen::Tensor out(40,50,70); - - in1 = in1.random() + in1.constant(10.0f); - in2 = in2.random() + in2.constant(10.0f); - - CPUContext context(in1, in2, out); - test_contextual_eval(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); - } - } - } - - test_forced_contextual_eval(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); - } - } - } - - test_compound_assignment(&context); - for (int i = 0; i < 40; 
++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); - } - } - } - - test_contraction(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 40; ++j) { - const float result = out(i,j,0); - float expected = 0; - for (int k = 0; k < 50; ++k) { - for (int l = 0; l < 70; ++l) { - expected += in1(i, k, l) * in2(j, k, l); - } - } - VERIFY_IS_APPROX(expected, result); - } - } - - test_1d_convolution(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); - } - } - } - - test_2d_convolution(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 69; ++k) { - const float result = out(i,j,k); - const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + - (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); - if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { - continue; - } - VERIFY_IS_APPROX(expected, result); - } - } - } - - test_3d_convolution(&context); - for (int i = 0; i < 39; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 69; ++k) { - const float result = out(i,j,k); - const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + - in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + - (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + - in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); - if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { - continue; - } - VERIFY_IS_APPROX(expected, result); - } - } - } -} - -static void test_gpu() { - Eigen::Tensor in1(40,50,70); - Eigen::Tensor in2(40,50,70); - Eigen::Tensor out(40,50,70); - in1 = in1.random() + in1.constant(10.0f); - in2 = in2.random() + in2.constant(10.0f); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t in2_bytes = in2.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_in1; - float* d_in2; - float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - - Eigen::TensorMap > gpu_in1(d_in1, 40,50,70); - Eigen::TensorMap > gpu_in2(d_in2, 40,50,70); - Eigen::TensorMap > gpu_out(d_out, 40,50,70); - - GPUContext context(gpu_in1, gpu_in2, gpu_out); - test_contextual_eval(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); - } - } - } - - test_forced_contextual_eval(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); - } - } - } - - test_compound_assignment(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); - } - } - } - - test_contraction(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, 
cudaMemcpyDeviceToHost) == cudaSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 40; ++j) { - const float result = out(i,j,0); - float expected = 0; - for (int k = 0; k < 50; ++k) { - for (int l = 0; l < 70; ++l) { - expected += in1(i, k, l) * in2(j, k, l); - } - } - VERIFY_IS_APPROX(expected, result); - } - } - - test_1d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); - } - } - } - - test_2d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 69; ++k) { - const float result = out(i,j,k); - const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + - in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); - VERIFY_IS_APPROX(expected, result); - } - } - } - - test_3d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); - for (int i = 0; i < 39; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 69; ++k) { - const float result = out(i,j,k); - const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + - in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f + - in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + - in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); - VERIFY_IS_APPROX(expected, result); - } - } - } -} - - -void test_cxx11_tensor_device() -{ - CALL_SUBTEST(test_cpu()); - CALL_SUBTEST(test_gpu()); -} diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu new file mode 100644 index 000000000..ed5dd7505 --- /dev/null +++ b/unsupported/test/cxx11_tensor_device.cu @@ -0,0 +1,388 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
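A note on the harness these .cu files share: EIGEN_TEST_FUNC names the entry point that the main() provided by main.h will invoke (prefixed with "test_"), CALL_SUBTEST wraps each sub-case, and renaming the sources from .cpp to .cu moves them under nvcc so that the EIGEN_USE_GPU code paths are actually compiled for the device. A minimal file following the same convention might look like the sketch below; the test name cxx11_tensor_example and its body are hypothetical, only the macro names and include pattern are taken from this patch:

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_example  // hypothetical test name
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

// A trivial sub-case: build a tensor, check one coefficient.
static void test_example_subtest()
{
  Eigen::Tensor<float, 1> t(8);
  t.setConstant(1.0f);
  VERIFY_IS_APPROX(t(0), 1.0f);
}

// main.h calls the function named by EIGEN_TEST_FUNC, prefixed with "test_".
void test_cxx11_tensor_example()
{
  CALL_SUBTEST(test_example_subtest());
}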
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_device +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +// Context for evaluation on cpu +struct CPUContext { + CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { + kernel_1d_(0) = 3.14f; + kernel_1d_(1) = 2.7f; + + kernel_2d_(0,0) = 3.14f; + kernel_2d_(1,0) = 2.7f; + kernel_2d_(0,1) = 0.2f; + kernel_2d_(1,1) = 7.0f; + + kernel_3d_(0,0,0) = 3.14f; + kernel_3d_(0,1,0) = 2.7f; + kernel_3d_(0,0,1) = 0.2f; + kernel_3d_(0,1,1) = 7.0f; + kernel_3d_(1,0,0) = -1.0f; + kernel_3d_(1,1,0) = -0.3f; + kernel_3d_(1,0,1) = -0.7f; + kernel_3d_(1,1,1) = -0.5f; + } + + const Eigen::DefaultDevice& device() const { return cpu_device_; } + + const Eigen::Tensor& in1() const { return in1_; } + const Eigen::Tensor& in2() const { return in2_; } + Eigen::Tensor& out() { return out_; } + const Eigen::Tensor& kernel1d() const { return kernel_1d_; } + const Eigen::Tensor& kernel2d() const { return kernel_2d_; } + const Eigen::Tensor& kernel3d() const { return kernel_3d_; } + + private: + const Eigen::Tensor& in1_; + const Eigen::Tensor& in2_; + Eigen::Tensor& out_; + + Eigen::Tensor kernel_1d_; + Eigen::Tensor kernel_2d_; + Eigen::Tensor kernel_3d_; + + Eigen::DefaultDevice cpu_device_; +}; + + +// Context for evaluation on GPU +struct GPUContext { + GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { + assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); + float kernel_1d_val[] = {3.14f, 2.7f}; + assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); + float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; + assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); + float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; + assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + } + ~GPUContext() { + assert(cudaFree(kernel_1d_) == cudaSuccess); + assert(cudaFree(kernel_2d_) == cudaSuccess); + assert(cudaFree(kernel_3d_) == cudaSuccess); + } + + const Eigen::GpuDevice& device() const { return gpu_device_; } + + const Eigen::TensorMap >& in1() const { return in1_; } + const Eigen::TensorMap >& in2() const { return in2_; } + Eigen::TensorMap >& out() { return out_; } + Eigen::TensorMap > kernel1d() const { return Eigen::TensorMap >(kernel_1d_, 2); } + Eigen::TensorMap > kernel2d() const { return Eigen::TensorMap >(kernel_2d_, 2, 2); } + Eigen::TensorMap > kernel3d() const { return Eigen::TensorMap >(kernel_3d_, 2, 2, 2); } + + private: + const Eigen::TensorMap >& in1_; + const Eigen::TensorMap >& in2_; + Eigen::TensorMap >& out_; + + float* kernel_1d_; + float* kernel_2d_; + float* kernel_3d_; + + Eigen::CudaStreamDevice stream_; + Eigen::GpuDevice gpu_device_; +}; + + +// The actual expression to evaluate +template +static void test_contextual_eval(Context* context) +{ + context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + 
context->in1().constant(2.718f); +} + +template +static void test_forced_contextual_eval(Context* context) +{ + context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); +} + +template +static void test_compound_assignment(Context* context) +{ + context->out().device(context->device()) = context->in1().constant(2.718f); + context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; +} + + +template +static void test_contraction(Context* context) +{ + Eigen::array, 2> dims; + dims[0] = std::make_pair(1, 1); + dims[1] = std::make_pair(2, 2); + + Eigen::array shape(40, 50*70); + + Eigen::DSizes indices(0,0); + Eigen::DSizes sizes(40,40); + + context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); +} + + +template +static void test_1d_convolution(Context* context) +{ + Eigen::DSizes indices(0,0,0); + Eigen::DSizes sizes(40,49,70); + + Eigen::array dims(1); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); +} + +template +static void test_2d_convolution(Context* context) +{ + Eigen::DSizes indices(0,0,0); + Eigen::DSizes sizes(40,49,69); + + Eigen::array dims(1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); +} + +template +static void test_3d_convolution(Context* context) +{ + Eigen::DSizes indices(0,0,0); + Eigen::DSizes sizes(39,49,69); + + Eigen::array dims(0,1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); +} + + +static void test_cpu() { + Eigen::Tensor in1(40,50,70); + Eigen::Tensor in2(40,50,70); + Eigen::Tensor out(40,50,70); + + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); + + CPUContext context(in1, in2, out); + test_contextual_eval(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_forced_contextual_eval(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); + } + } + } + + test_compound_assignment(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_contraction(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(i,j,0); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(i, k, l) * in2(j, k, l); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + + (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + 
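  // (VERIFY_IS_APPROX compares with a relative tolerance, which is
  // meaningless when both operands are essentially zero, so the guard above
  // skips those outputs rather than failing them on round-off noise.)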
VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+  test_3d_convolution(&context);
+  for (int i = 0; i < 39; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
+                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+        if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
+          continue;
+        }
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+}
+
+static void test_gpu() {
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
+
+  GPUContext context(gpu_in1, gpu_in2, gpu_out);
+  test_contextual_eval(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_forced_contextual_eval(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_compound_assignment(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_contraction(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 40; ++j) {
+      const float result = out(i,j,0);
+      float expected = 0;
+      for (int k = 0; k < 50; ++k) {
+        for (int l = 0; l < 70; ++l) {
+          expected += in1(i, k, l) * in2(j, k, l);
+        }
+      }
+      VERIFY_IS_APPROX(expected, result);
+    }
+  }
+
+  test_1d_convolution(&context);
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+      }
+    }
+  }
+
+  test_2d_convolution(&context);
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+  test_3d_convolution(&context);
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  for (int i = 0; i < 39; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
+                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_device()
+{
+  CALL_SUBTEST(test_cpu());
+  CALL_SUBTEST(test_gpu());
+}
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cpp b/unsupported/test/cxx11_tensor_random_cuda.cpp
deleted file mode 100644
index 5d091de15..000000000
--- a/unsupported/test/cxx11_tensor_random_cuda.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <Eigen/CXX11/Tensor>
-
-static void test_default()
-{
-  Tensor<std::complex<float>, 1> vec(6);
-  vec.setRandom();
-
-  // Fixme: we should check that the generated numbers follow a uniform
-  // distribution instead.
-  for (int i = 1; i < 6; ++i) {
-    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
-  }
-}
-
-
-void test_cxx11_tensor_random_cuda()
-{
-  CALL_SUBTEST(test_default());
-}
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
new file mode 100644
index 000000000..5d091de15
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+static void test_default()
+{
+  Tensor<std::complex<float>, 1> vec(6);
+  vec.setRandom();
+
+  // Fixme: we should check that the generated numbers follow a uniform
+  // distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+
+void test_cxx11_tensor_random_cuda()
+{
+  CALL_SUBTEST(test_default());
+}
diff --git a/unsupported/test/cxx11_tensor_reduction.cu b/unsupported/test/cxx11_tensor_reduction.cu
new file mode 100644
index 000000000..9e06eb126
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction.cu
@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+
+template <int DataLayout>
+static void test_full_reductions() {
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<float, 2, DataLayout> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<float, 0, DataLayout> full_redux;
+  full_redux = in.sum();
+
+  std::size_t in_bytes = in.size() * sizeof(float);
+  std::size_t out_bytes = full_redux.size() * sizeof(float);
+  float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
+  float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.sum();
+
+  Tensor<float, 0, DataLayout> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+}
+
+void test_cxx11_tensor_reduction_cuda() {
+  CALL_SUBTEST(test_full_reductions<ColMajor>());
+  CALL_SUBTEST(test_full_reductions<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction_cuda.cpp
deleted file mode 100644
index 9e06eb126..000000000
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <Eigen/CXX11/Tensor>
-
-
-template <int DataLayout>
-static void test_full_reductions() {
-
-  Eigen::CudaStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  const int num_rows = internal::random<int>(1024, 5*1024);
-  const int num_cols = internal::random<int>(1024, 5*1024);
-
-  Tensor<float, 2, DataLayout> in(num_rows, num_cols);
-  in.setRandom();
-
-  Tensor<float, 0, DataLayout> full_redux;
-  full_redux = in.sum();
-
-  std::size_t in_bytes = in.size() * sizeof(float);
-  std::size_t out_bytes = full_redux.size() * sizeof(float);
-  float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
-  float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
-  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
-
-  TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
-  TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr);
-
-  out_gpu.device(gpu_device) = in_gpu.sum();
-
-  Tensor<float, 0, DataLayout> full_redux_gpu;
-  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
-  gpu_device.synchronize();
-
-  // Check that the CPU and GPU reductions return the same result.
- VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); -} - -void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST(test_full_reductions()); - CALL_SUBTEST(test_full_reductions()); -} -- cgit v1.2.3 From 4865e1e73265e12d564f8b4d9069a2159f777d90 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 22:48:40 +0100 Subject: Update link to suitesparse. --- Eigen/CholmodSupport | 2 +- Eigen/SPQRSupport | 2 +- Eigen/UmfPackSupport | 2 +- Eigen/src/OrderingMethods/Amd.h | 2 +- Eigen/src/OrderingMethods/Eigen_Colamd.h | 6 +----- doc/SparseLinearSystems.dox | 6 +++--- 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport index 83e2c1da4..bed8924d3 100644 --- a/Eigen/CholmodSupport +++ b/Eigen/CholmodSupport @@ -19,7 +19,7 @@ extern "C" { /** \ingroup Support_modules * \defgroup CholmodSupport_Module CholmodSupport module * - * This module provides an interface to the Cholmod library which is part of the suitesparse package. + * This module provides an interface to the Cholmod library which is part of the suitesparse package. * It provides the two following main factorization classes: * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization. * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport index f9489dcd8..f70390c17 100644 --- a/Eigen/SPQRSupport +++ b/Eigen/SPQRSupport @@ -17,7 +17,7 @@ /** \ingroup Support_modules * \defgroup SPQRSupport_Module SuiteSparseQR module * - * This module provides an interface to the SPQR library, which is part of the suitesparse package. + * This module provides an interface to the SPQR library, which is part of the suitesparse package. * * \code * #include diff --git a/Eigen/UmfPackSupport b/Eigen/UmfPackSupport index 4a9f46a1e..00eec8087 100644 --- a/Eigen/UmfPackSupport +++ b/Eigen/UmfPackSupport @@ -19,7 +19,7 @@ extern "C" { /** \ingroup Support_modules * \defgroup UmfPackSupport_Module UmfPackSupport module * - * This module provides an interface to the UmfPack library which is part of the suitesparse package. + * This module provides an interface to the UmfPack library which is part of the suitesparse package. * It provides the following factorization class: * - class UmfPackLU: a multifrontal sequential LU factorization. * diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h index d1d08ca57..f91ecb24e 100644 --- a/Eigen/src/OrderingMethods/Amd.h +++ b/Eigen/src/OrderingMethods/Amd.h @@ -8,7 +8,7 @@ NOTE: this routine has been adapted from the CSparse library: Copyright (c) 2006, Timothy A. Davis. -http://www.cise.ufl.edu/research/sparse/CSparse +http://www.suitesparse.com CSparse is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 70c987afa..960df4a46 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -41,12 +41,8 @@ // // The colamd/symamd library is available at // -// http://www.cise.ufl.edu/research/sparse/colamd/ +// http://www.suitesparse.com -// This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h -// file. 
It is required by the colamd.c, colamdmex.c, and symamdmex.c -// files, and by any C code that calls the routines whose prototypes are -// listed below, or that uses the colamd/symamd definitions listed below. #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 61cc50afa..ee4f53a4e 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -65,17 +65,17 @@ They are summarized in the following tables: Requires the PaStiX package, \b CeCILL-C optimized for tough problems and symmetric patterns CholmodSupernodalLLT\link CholmodSupport_Module CholmodSupport \endlinkDirect LLt factorizationSPDFill-in reducing, Leverage fast dense algebra - Requires the SuiteSparse package, \b GPL + Requires the SuiteSparse package, \b GPL UmfPackLU\link UmfPackSupport_Module UmfPackSupport \endlinkDirect LU factorizationSquareFill-in reducing, Leverage fast dense algebra - Requires the SuiteSparse package, \b GPL + Requires the SuiteSparse package, \b GPL SuperLU\link SuperLUSupport_Module SuperLUSupport \endlinkDirect LU factorizationSquareFill-in reducing, Leverage fast dense algebra Requires the SuperLU library, (BSD-like) SPQR\link SPQRSupport_Module SPQRSupport \endlink QR factorization Any, rectangularfill-in reducing, multithreaded, fast dense algebra - requires the SuiteSparse package, \b GPL recommended for linear least-squares problems, has a rank-revealing feature + requires the SuiteSparse package, \b GPL recommended for linear least-squares problems, has a rank-revealing feature Here \c SPD means symmetric positive definite. -- cgit v1.2.3 From 55a5204319f119f3609e0522302ff24e69ab60ae Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 14:46:34 -0800 Subject: Fixed the flags passed to nvcc to compile the tensor code. 
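
Note: flags passed through ei_add_test only reach the host compiler, so
the "-std=c++11" that each CUDA test used to request never reached nvcc;
it now goes into CUDA_NVCC_FLAGS directly, together with an -arch switch
so that the generated device code targets a recent enough architecture.
The C++11 flag matters because the tensor module itself depends on C++11
features. A rough sketch of the kind of code involved (illustrative
only, not part of the patch):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 3> t(2, 3, 4);  // variadic constructor: C++11 only
      t.setZero();
      return 0;
    }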
--- unsupported/test/CMakeLists.txt | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 5c383aab6..3a90a5371 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -157,15 +157,16 @@ if(CUDA_FOUND) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() + set(CUDA_NVCC_FLAGS "-std=c++11 -arch compute_30") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") - set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - - ei_add_test(cxx11_tensor_device "-std=c++11") - ei_add_test(cxx11_tensor_cuda "-std=c++11") - ei_add_test(cxx11_tensor_contract_cuda "-std=c++11") - ei_add_test(cxx11_tensor_reduction_cuda "-std=c++11") - ei_add_test(cxx11_tensor_random_cuda "-std=c++11") - ei_add_test(cxx11_tensor_argmax_cuda "-std=c++11 -I/opt-cuda-7.0/include") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + + ei_add_test(cxx11_tensor_device) + ei_add_test(cxx11_tensor_cuda) + ei_add_test(cxx11_tensor_contract_cuda) + ei_add_test(cxx11_tensor_reduction_cuda) + ei_add_test(cxx11_tensor_random_cuda) + ei_add_test(cxx11_tensor_argmax_cuda) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif(CUDA_FOUND) -- cgit v1.2.3 From 47ca9dc809801b60c1bfe49b391a37bd62eb8888 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 14:58:48 -0800 Subject: Fixed the tensor_cuda test --- unsupported/test/cxx11_tensor_cuda.cu | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 49e1894ab..79f1c5315 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -131,8 +131,7 @@ void test_cuda_reduction() cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_in1(d_in1, 72,53,97,113); @@ -189,8 +188,7 @@ static void test_cuda_contraction() cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_t_left(d_t_left, 6, 50, 3, 31); @@ -214,7 +212,7 @@ static void test_cuda_contraction() for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } } @@ -243,8 +241,7 @@ static void test_cuda_convolution_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input, 74,37,11,137); @@ -293,8 +290,7 @@ static void test_cuda_convolution_inner_dim_col_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, 
cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input,74,9,11,7); @@ -343,8 +339,7 @@ static void test_cuda_convolution_inner_dim_row_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input, 7,9,11,74); @@ -394,8 +389,7 @@ static void test_cuda_convolution_2d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input,74,37,11,137); @@ -455,8 +449,7 @@ static void test_cuda_convolution_3d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input,74,37,11,137,17); @@ -644,10 +637,6 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs CALL_SUBTEST(test_cuda_erfc(0.01f)); CALL_SUBTEST(test_cuda_erfc(0.001f)); - CALL_SUBTEST(test_cuda_tanh(1.0)); - CALL_SUBTEST(test_cuda_tanh(100.0)); - CALL_SUBTEST(test_cuda_tanh(0.01)); - CALL_SUBTEST(test_cuda_tanh(0.001)); CALL_SUBTEST(test_cuda_lgamma(1.0)); CALL_SUBTEST(test_cuda_lgamma(100.0)); CALL_SUBTEST(test_cuda_lgamma(0.01)); -- cgit v1.2.3 From 291069e885dccad6059e4bda34aad30ab69cbd85 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 15:37:03 -0800 Subject: Fixed some compilation problems with nvcc + clang --- Eigen/src/Core/util/Memory.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 823e077af..415bc48cb 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -526,9 +526,9 @@ template EIGEN_DEVICE_FUNC inline void conditional_align template EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size) { - static const Index ScalarSize = sizeof(Scalar); - static const Index AlignmentSize = Alignment / ScalarSize; - static const Index AlignmentMask = AlignmentSize-1; + const Index ScalarSize = sizeof(Scalar); + const Index AlignmentSize = Alignment / ScalarSize; + const Index AlignmentMask = AlignmentSize-1; if(AlignmentSize<=1) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 09ee0c2c6..7a5dfbfea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -346,7 +346,7 @@ struct InnerReducer { static const bool HasOptimizedImplementation = false; static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - assert(false 
&& "Not implemented"); + eigen_assert(false && "Not implemented"); } }; @@ -356,7 +356,7 @@ struct OuterReducer { static const bool HasOptimizedImplementation = false; static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - assert(false && "Not implemented"); + eigen_assert(false && "Not implemented"); } }; -- cgit v1.2.3 From 4bf9eaf77aa8c9a75b5d60c781d5d86b833b93d1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 17:09:30 -0800 Subject: Deleted an invalid assertion that prevented the assignment of empty tensors. --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 1 - unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_empty.cpp | 36 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 unsupported/test/cxx11_tensor_empty.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 98631fc7f..18a916e46 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -105,7 +105,6 @@ class TensorStorage, Options_> EIGEN_DEVICE_FUNC void resize(Index size, const array& nbDimensions) { - eigen_assert(size >= 1); const Index currentSz = internal::array_prod(m_dimensions); if(size != currentSz) { diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 3a90a5371..d70bf2b88 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -147,6 +147,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_sugar "-std=c++0x") ei_add_test(cxx11_tensor_fft "-std=c++0x") ei_add_test(cxx11_tensor_ifft "-std=c++0x") + ei_add_test(cxx11_tensor_empty "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp new file mode 100644 index 000000000..ca03a297c --- /dev/null +++ b/unsupported/test/cxx11_tensor_empty.cpp @@ -0,0 +1,36 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + + +static void test_empty_tensor() +{ + Tensor source; + Tensor tgt1 = source; + Tensor tgt2; + tgt2 = source; +} + +static void test_empty_fixed_size_tensor() +{ + TensorFixedSize> source; + TensorFixedSize> tgt1 = source; + TensorFixedSize> tgt2; + tgt2 = source; +} + + +void test_cxx11_tensor_empty() +{ + CALL_SUBTEST(test_empty_tensor()); + CALL_SUBTEST(test_empty_fixed_size_tensor()); +} -- cgit v1.2.3 From 7802a6bb1cc6477810dff0e83ec90af954784612 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 09:35:37 +0100 Subject: Fix unit test filename. 
--- unsupported/test/CMakeLists.txt | 4 +- unsupported/test/cxx11_tensor_reduction.cu | 56 ------------------------- unsupported/test/cxx11_tensor_reduction_cuda.cu | 56 +++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 58 deletions(-) delete mode 100644 unsupported/test/cxx11_tensor_reduction.cu create mode 100644 unsupported/test/cxx11_tensor_reduction_cuda.cu diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d70bf2b88..d16c42656 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -152,7 +152,7 @@ if(EIGEN_TEST_CXX11) endif() # These tests needs nvcc -find_package(CUDA 7) +find_package(CUDA 7.0) if(CUDA_FOUND) set(CUDA_PROPAGATE_HOST_FLAGS OFF) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") @@ -170,4 +170,4 @@ if(CUDA_FOUND) ei_add_test(cxx11_tensor_argmax_cuda) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) -endif(CUDA_FOUND) +endif() diff --git a/unsupported/test/cxx11_tensor_reduction.cu b/unsupported/test/cxx11_tensor_reduction.cu deleted file mode 100644 index 9e06eb126..000000000 --- a/unsupported/test/cxx11_tensor_reduction.cu +++ /dev/null @@ -1,56 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda -#define EIGEN_USE_GPU - -#include "main.h" -#include - - -template -static void test_full_reductions() { - - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - const int num_rows = internal::random(1024, 5*1024); - const int num_cols = internal::random(1024, 5*1024); - - Tensor in(num_rows, num_cols); - in.setRandom(); - - Tensor full_redux; - full_redux = in.sum(); - - std::size_t in_bytes = in.size() * sizeof(float); - std::size_t out_bytes = full_redux.size() * sizeof(float); - float* gpu_in_ptr = static_cast(gpu_device.allocate(in_bytes)); - float* gpu_out_ptr = static_cast(gpu_device.allocate(out_bytes)); - gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); - - TensorMap > in_gpu(gpu_in_ptr, num_rows, num_cols); - TensorMap > out_gpu(gpu_out_ptr); - - out_gpu.device(gpu_device) = in_gpu.sum(); - - Tensor full_redux_gpu; - gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); - gpu_device.synchronize(); - - // Check that the CPU and GPU reductions return the same result. - VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); -} - -void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST(test_full_reductions()); - CALL_SUBTEST(test_full_reductions()); -} diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu new file mode 100644 index 000000000..9e06eb126 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda +#define EIGEN_USE_GPU + +#include "main.h" +#include + + +template +static void test_full_reductions() { + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + const int num_rows = internal::random(1024, 5*1024); + const int num_cols = internal::random(1024, 5*1024); + + Tensor in(num_rows, num_cols); + in.setRandom(); + + Tensor full_redux; + full_redux = in.sum(); + + std::size_t in_bytes = in.size() * sizeof(float); + std::size_t out_bytes = full_redux.size() * sizeof(float); + float* gpu_in_ptr = static_cast(gpu_device.allocate(in_bytes)); + float* gpu_out_ptr = static_cast(gpu_device.allocate(out_bytes)); + gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); + + TensorMap > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap > out_gpu(gpu_out_ptr); + + out_gpu.device(gpu_device) = in_gpu.sum(); + + Tensor full_redux_gpu; + gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); + gpu_device.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); +} + +void test_cxx11_tensor_reduction_cuda() { + CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST(test_full_reductions()); +} -- cgit v1.2.3 From 2bad3e78d9e23dec7e4ad31c4ad2bdc761b8f3b6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 12:12:06 +0100 Subject: bug #96, bug #1006: fix by value argument in result_of. --- Eigen/src/Core/CwiseBinaryOp.h | 4 ++-- Eigen/src/Core/CwiseUnaryOp.h | 2 +- Eigen/src/Core/CwiseUnaryView.h | 2 +- Eigen/src/Core/VectorwiseOp.h | 2 +- Eigen/src/Core/util/Meta.h | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index f94629e6d..39820fd7d 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -32,8 +32,8 @@ struct traits > // we still want to handle the case when the result type is different. 
typedef typename result_of< BinaryOp( - typename Lhs::Scalar, - typename Rhs::Scalar + const typename Lhs::Scalar&, + const typename Rhs::Scalar& ) >::type Scalar; typedef typename cwise_promote_storage_type::StorageKind, diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index 5a809cf21..22db783b5 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -19,7 +19,7 @@ struct traits > : traits { typedef typename result_of< - UnaryOp(typename XprType::Scalar) + UnaryOp(const typename XprType::Scalar&) >::type Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 5a7db2b19..a9252eddf 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -18,7 +18,7 @@ struct traits > : traits { typedef typename result_of< - ViewOp(typename traits::Scalar) + ViewOp(const typename traits::Scalar&) >::type Scalar; typedef typename MatrixType::Nested MatrixTypeNested; typedef typename remove_all::type _MatrixTypeNested; diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 95bcaa86f..193891189 100755 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -124,7 +124,7 @@ struct member_lpnorm { template struct member_redux { typedef typename result_of< - BinaryOp(Scalar,Scalar) + BinaryOp(const Scalar&,const Scalar&) >::type result_type; template struct Cost { enum { value = (Size-1) * functor_traits::Cost }; }; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index e3e6d763d..b01437d88 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -257,7 +257,7 @@ struct has_std_result_type {int a[2];}; struct has_tr1_result {int a[3];}; template -struct unary_result_of_select {typedef ArgType type;}; +struct unary_result_of_select {typedef typename internal::remove_all::type type;}; template struct unary_result_of_select {typedef typename Func::result_type type;}; @@ -279,7 +279,7 @@ struct result_of { }; template -struct binary_result_of_select {typedef ArgType0 type;}; +struct binary_result_of_select {typedef typename internal::remove_all::type type;}; template struct binary_result_of_select -- cgit v1.2.3 From b4d87fff4a47797154b661129a0b73d31688d582 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 12:12:30 +0100 Subject: Fix MSVC warning. 
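
Note: MSVC's _isnan() returns int rather than bool, so returning its
value from a function declared bool makes the compiler emit warning
C4800 ("forcing value to bool 'true' or 'false'"). Comparing against 0
makes the conversion explicit; schematically:

    bool bad  = _isnan(x);       // warns (C4800) under MSVC
    bool good = _isnan(x) != 0;  // same result, no warning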
--- Eigen/src/Core/MathFunctions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index e47070a46..e87b60f8f 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -748,9 +748,9 @@ template EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) } //MSVC defines a _isnan builtin function, but for double only -EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x); } -EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x); } -EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x); } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; } EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); } EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); } -- cgit v1.2.3 From 9bcadb7fd1d8852dbc74fe054878b0a12f4aed4e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 12:14:16 +0100 Subject: Disable stupid MSVC warning --- Eigen/src/Core/util/DisableStupidWarnings.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 747232938..91c61fcf2 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -15,10 +15,11 @@ // 4522 - 'class' : multiple assignment operators specified // 4700 - uninitialized local variable 'xyz' used // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow + // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning) #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 ) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) // ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body -- cgit v1.2.3 From df15fbc4520402a00b053bd02c782b77a5b72f61 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 13:16:30 +0100 Subject: bug #1158: PartialReduxExpr is a vector expression, and it thus must expose the LinearAccessBit flag --- Eigen/src/Core/CoreEvaluators.h | 2 +- test/vectorwiseop.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 8bd73b814..7776948d1 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -994,7 +994,7 @@ struct evaluator > CoeffReadCost = TraversalSize==Dynamic ? 
HugeCost : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), - Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))), + Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized }; diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 87476f95b..3cc198772 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -210,6 +210,9 @@ template void vectorwiseop_matrix(const MatrixType& m) VERIFY_IS_APPROX(m1.cwiseAbs().colwise().maxCoeff(), m1.colwise().template lpNorm()); VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().maxCoeff(), m1.rowwise().template lpNorm()); + // regression for bug 1158 + VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum().x(), m1.col(0).cwiseAbs().sum()); + // test normalized m2 = m1.colwise().normalized(); VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized()); -- cgit v1.2.3 From f50bb1e6f327055a6113787e9a417bc9c192c4e4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 13:25:26 +0100 Subject: Fix compilation with gcc --- Eigen/src/Core/SpecialFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 21583e6f5..9f89e184d 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -81,7 +81,7 @@ template struct polevl { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar run(const Scalar x, const Scalar coef[]) { - EIGEN_STATIC_ASSERT(N > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); return polevl::run(x, coef) * x + coef[N]; } -- cgit v1.2.3 From c4e47630b16a716d01dc20b36afa8882b03681a1 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Thu, 28 Jan 2016 10:35:14 -0800 Subject: benchmark modifications to make it compilable in a standalone fashion. --- bench/tensors/benchmark.h | 48 ++++++++ bench/tensors/benchmark_main.cc | 215 +++++++++++++++++++++++++++++++++ bench/tensors/tensor_benchmarks.h | 87 +++++++------ bench/tensors/tensor_benchmarks_cpu.cc | 20 +-- bench/tensors/tensor_benchmarks_gpu.cc | 4 +- 5 files changed, 318 insertions(+), 56 deletions(-) create mode 100644 bench/tensors/benchmark.h create mode 100644 bench/tensors/benchmark_main.cc diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h new file mode 100644 index 000000000..d8b4fd4c6 --- /dev/null +++ b/bench/tensors/benchmark.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +namespace testing { +class Benchmark { + public: + Benchmark(const char* name, void (*fn)(int)) { + Register(name, fn, NULL); + } + Benchmark(const char* name, void (*fn_range)(int, int)) { + Register(name, NULL, fn_range); + } + Benchmark* Arg(int x); + Benchmark* Range(int lo, int hi); + const char* Name(); + bool ShouldRun(int argc, char* argv[]); + void Run(); + private: + const char* name_; + void (*fn_)(int); + void (*fn_range_)(int, int); + std::vector args_; + void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)); + void RunRepeatedlyWithArg(int iterations, int arg); + void RunWithArg(int arg); +}; +} // namespace testing +void SetBenchmarkBytesProcessed(int64_t); +void StopBenchmarkTiming(); +void StartBenchmarkTiming(); +#define BENCHMARK(f) \ + static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ + (new ::testing::Benchmark(#f, f)) \ No newline at end of file diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc new file mode 100644 index 000000000..0fc12960e --- /dev/null +++ b/bench/tensors/benchmark_main.cc @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "benchmark.h" +#include +#include +#include +#include +#include +#include +#include + +static int64_t g_bytes_processed; +static int64_t g_benchmark_total_time_ns; +static int64_t g_benchmark_start_time_ns; +typedef std::map BenchmarkMap; +typedef BenchmarkMap::iterator BenchmarkMapIt; +static BenchmarkMap g_benchmarks; +static int g_name_column_width = 20; +static int Round(int n) { + int base = 1; + while (base*10 < n) { + base *= 10; + } + if (n < 2*base) { + return 2*base; + } + if (n < 5*base) { + return 5*base; + } + return 10*base; +} +static int64_t NanoTime() { + struct timespec t; + t.tv_sec = t.tv_nsec = 0; + clock_gettime(CLOCK_MONOTONIC, &t); + return static_cast(t.tv_sec) * 1000000000LL + t.tv_nsec; +} +namespace testing { +Benchmark* Benchmark::Arg(int arg) { + args_.push_back(arg); + return this; +} + +Benchmark* Benchmark::Range(int lo, int hi) { + const int kRangeMultiplier = 8; + if (hi < lo) { + int temp = hi; + hi = lo; + lo = temp; + } + while (lo < hi) { + args_.push_back(lo); + lo *= kRangeMultiplier; + } + // We always run the hi number. + args_.push_back(hi); + return this; +} + +const char* Benchmark::Name() { + return name_; +} +bool Benchmark::ShouldRun(int argc, char* argv[]) { + if (argc == 1) { + return true; // With no arguments, we run all benchmarks. + } + // Otherwise, we interpret each argument as a regular expression and + // see if any of our benchmarks match. 
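+  // (Each pattern is compiled with regcomp() and flags 0, i.e. POSIX
+  // basic regular expression syntax.)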
+ for (int i = 1; i < argc; i++) { + regex_t re; + if (regcomp(&re, argv[i], 0) != 0) { + fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]); + exit(EXIT_FAILURE); + } + int match = regexec(&re, name_, 0, NULL, 0); + regfree(&re); + if (match != REG_NOMATCH) { + return true; + } + } + return false; +} +void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) { + name_ = name; + fn_ = fn; + fn_range_ = fn_range; + if (fn_ == NULL && fn_range_ == NULL) { + fprintf(stderr, "%s: missing function\n", name_); + exit(EXIT_FAILURE); + } + g_benchmarks.insert(std::make_pair(name, this)); +} +void Benchmark::Run() { + if (fn_ != NULL) { + RunWithArg(0); + } else { + if (args_.empty()) { + fprintf(stderr, "%s: no args!\n", name_); + exit(EXIT_FAILURE); + } + for (size_t i = 0; i < args_.size(); ++i) { + RunWithArg(args_[i]); + } + } +} +void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) { + g_bytes_processed = 0; + g_benchmark_total_time_ns = 0; + g_benchmark_start_time_ns = NanoTime(); + if (fn_ != NULL) { + fn_(iterations); + } else { + fn_range_(iterations, arg); + } + if (g_benchmark_start_time_ns != 0) { + g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns; + } +} +void Benchmark::RunWithArg(int arg) { + // run once in case it's expensive + int iterations = 1; + RunRepeatedlyWithArg(iterations, arg); + while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) { + int last = iterations; + if (g_benchmark_total_time_ns/iterations == 0) { + iterations = 1e9; + } else { + iterations = 1e9 / (g_benchmark_total_time_ns/iterations); + } + iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last)); + iterations = Round(iterations); + RunRepeatedlyWithArg(iterations, arg); + } + char throughput[100]; + throughput[0] = '\0'; + if (g_benchmark_total_time_ns > 0 && g_bytes_processed > 0) { + double mib_processed = static_cast(g_bytes_processed)/1e6; + double seconds = static_cast(g_benchmark_total_time_ns)/1e9; + snprintf(throughput, sizeof(throughput), " %8.2f MiB/s", mib_processed/seconds); + } + char full_name[100]; + if (fn_range_ != NULL) { + if (arg >= (1<<20)) { + snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20)); + } else if (arg >= (1<<10)) { + snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10)); + } else { + snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg); + } + } else { + snprintf(full_name, sizeof(full_name), "%s", name_); + } + printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name, + iterations, g_benchmark_total_time_ns/iterations, throughput); + fflush(stdout); +} +} // namespace testing +void SetBenchmarkBytesProcessed(int64_t x) { + g_bytes_processed = x; +} +void StopBenchmarkTiming() { + if (g_benchmark_start_time_ns != 0) { + g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns; + } + g_benchmark_start_time_ns = 0; +} +void StartBenchmarkTiming() { + if (g_benchmark_start_time_ns == 0) { + g_benchmark_start_time_ns = NanoTime(); + } +} +int main(int argc, char* argv[]) { + if (g_benchmarks.empty()) { + fprintf(stderr, "No benchmarks registered!\n"); + exit(EXIT_FAILURE); + } + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + int name_width = static_cast(strlen(it->second->Name())); + g_name_column_width = std::max(g_name_column_width, name_width); + } + bool need_header = true; + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + 
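+    // The header row is printed lazily, just before the first benchmark
+    // that actually runs (see need_header above).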
::testing::Benchmark* b = it->second; + if (b->ShouldRun(argc, argv)) { + if (need_header) { + printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op"); + fflush(stdout); + need_header = false; + } + b->Run(); + } + } + if (need_header) { + fprintf(stderr, "No matching benchmarks!\n"); + fprintf(stderr, "Available benchmarks:\n"); + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + fprintf(stderr, " %s\n", it->second->Name()); + } + exit(EXIT_FAILURE); + } + return 0; +} \ No newline at end of file diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 525b9acda..a1696afda 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -4,12 +4,23 @@ typedef int TensorIndex; #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "testing/base/public/benchmark.h" +#include "unsupported/Eigen/CXX11/Tensor" +#include "benchmark.h" + +#define BENCHMARK_RANGE(bench, lo, hi) \ + BENCHMARK(bench)->Range(lo, hi) + +template +std::string StrCat(const Args... args) { + std::stringstream ss; + StrCatRecursive(ss, args...); + return ss.str(); +} using Eigen::Tensor; using Eigen::TensorMap; +typedef int64_t int64; // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. @@ -43,7 +54,7 @@ template class BenchmarkSuite { void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; TensorMap, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); @@ -56,16 +67,16 @@ template class BenchmarkSuite { void slicing(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - const Eigen::DSizes quarter_sizes(Eigen::array(m_/2, m_/2)); - const Eigen::DSizes first_quadrant(Eigen::array(0, 0)); - const Eigen::DSizes second_quadrant(Eigen::array(0, m_/2)); - const Eigen::DSizes third_quadrant(Eigen::array(m_/2, 0)); - const Eigen::DSizes fourth_quadrant(Eigen::array(m_/2, m_/2)); + const Eigen::DSizes quarter_sizes(m_/2, m_/2); + const Eigen::DSizes first_quadrant(0, 0); + const Eigen::DSizes second_quadrant(0, m_/2); + const Eigen::DSizes third_quadrant(m_/2, 0); + const Eigen::DSizes fourth_quadrant(m_/2, m_/2); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -85,12 +96,12 @@ template class BenchmarkSuite { void shuffling(int num_iters) { eigen_assert(m_ == n_); - const Eigen::array size_a(m_, k_); + const Eigen::array size_a = {{m_, k_}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(k_, m_); + const Eigen::array size_b = {{k_, m_}}; TensorMap, Eigen::Aligned> B(b_, size_b); - const Eigen::array shuffle(1, 0); + const Eigen::array shuffle = {{1, 0}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -102,9 +113,9 @@ template class BenchmarkSuite { void padding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array size_a(m_, k_-3); + const Eigen::array size_a = {{m_, k_-3}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(k_, m_); + const Eigen::array size_b = {{k_, m_}}; TensorMap, Eigen::Aligned> B(b_, size_b); Eigen::array, 2> paddings; @@ -121,12 +132,12 @@ template class BenchmarkSuite { void 
striding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array size_a(m_, k_); + const Eigen::array size_a = {{m_, k_}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(m_, k_ / 2); + const Eigen::array size_b = {{m_, k_ / 2}}; TensorMap, Eigen::Aligned> B(b_, size_b); - const Eigen::array strides(1, 2); + const Eigen::array strides = {{1, 2}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -137,14 +148,14 @@ template class BenchmarkSuite { } void broadcasting(int num_iters) { - const Eigen::array size_a(m_, 1); + const Eigen::array size_a = {{m_, 1}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_c(m_, n_); + const Eigen::array size_c = {{m_, n_}}; TensorMap, Eigen::Aligned> C(c_, size_c); -#if defined(__CUDACC__) +#ifndef EIGEN_HAS_INDEX_LIST // nvcc doesn't support cxx11 - const Eigen::array broadcast(1, n_); + const Eigen::array broadcast = {{1, n_}}; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -162,7 +173,7 @@ template class BenchmarkSuite { void coeffWiseOp(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -178,7 +189,7 @@ template class BenchmarkSuite { void algebraicFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -194,7 +205,7 @@ template class BenchmarkSuite { void transcendentalFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -210,12 +221,12 @@ template class BenchmarkSuite { // Simple reduction void reduction(int num_iters) { - const Eigen::array input_size(k_, n_); + const Eigen::array input_size = {{k_, n_}}; const TensorMap, Eigen::Aligned> B(b_, input_size); - const Eigen::array output_size(n_); + const Eigen::array output_size = {{n_}}; TensorMap, Eigen::Aligned> C(c_, output_size); - const Eigen::array sum_along_dim(0); + const Eigen::array sum_along_dim = {{0}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -228,16 +239,16 @@ template class BenchmarkSuite { // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - const Eigen::array sizeA(m_, k_); - const Eigen::array sizeB(k_, n_); - const Eigen::array sizeC(m_, n_); + const Eigen::array sizeA = {{m_, k_}}; + const Eigen::array sizeB = {{k_, n_}}; + const Eigen::array sizeC = {{m_, n_}}; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; - const Eigen::array dims(DimPair(1, 0)); + const Eigen::array dims = {{DimPair(1, 0)}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -249,14 +260,14 @@ template class BenchmarkSuite { } void convolution(int num_iters, int kernel_x, int kernel_y) { - const Eigen::array input_sizes(m_, n_); + const Eigen::array input_sizes = {{m_, n_}}; TensorMap, 
Eigen::Aligned> A(a_, input_sizes); - const Eigen::array kernel_sizes(kernel_x, kernel_y); + const Eigen::array kernel_sizes = {{kernel_x, kernel_y}}; TensorMap, Eigen::Aligned> B(b_, kernel_sizes); - const Eigen::array result_sizes( - m_ - kernel_x + 1, n_ - kernel_y + 1); + const Eigen::array result_sizes = + {{m_ - kernel_x + 1, n_ - kernel_y + 1}}; TensorMap, Eigen::Aligned> C(c_, result_sizes); - Eigen::array::Index, 2> dims(0, 1); + Eigen::array::Index, 2> dims = {{0, 1}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -280,7 +291,7 @@ template class BenchmarkSuite { device_.memset(b_, 23, k_ * n_ * sizeof(float)); device_.memset(c_, 31, m_ * n_ * sizeof(float)); - BenchmarkUseRealTime(); + //BenchmarkUseRealTime(); } inline void finalizeBenchmark(int64 num_items) { @@ -290,7 +301,7 @@ template class BenchmarkSuite { } #endif StopBenchmarkTiming(); - SetBenchmarkItemsProcessed(num_items); + SetBenchmarkBytesProcessed(num_items); } diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 68653ba15..248a63861 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -1,19 +1,12 @@ #define EIGEN_USE_THREADS -#include "base/sysinfo.h" -#include "strings/strcat.h" -#include "third_party/eigen3/tensor_benchmarks.h" -#include "thread/threadpool.h" +#include + +#include "tensor_benchmarks.h" -#ifdef __ANDROID__ -#define CREATE_THREAD_POOL(threads) \ -Eigen::ThreadPoolDevice device(threads); -#else #define CREATE_THREAD_POOL(threads) \ -ThreadPool tp(threads); \ -tp.StartWorkers(); \ -Eigen::ThreadPoolDevice device(&tp, threads); -#endif +Eigen::ThreadPool pool(threads); \ +Eigen::ThreadPoolDevice device(&pool, threads); // Simple functions #define BM_FuncCPU(FUNC, THREADS) \ @@ -22,7 +15,6 @@ Eigen::ThreadPoolDevice device(&tp, threads); CREATE_THREAD_POOL(THREADS); \ BenchmarkSuite suite(device, N); \ suite.FUNC(iters); \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); @@ -84,7 +76,6 @@ BM_FuncCPU(reduction, 12); BenchmarkSuite suite(device, D1, D2, D3); \ suite.FUNC(iters); \ } \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); @@ -127,7 +118,6 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); CREATE_THREAD_POOL(THREADS); \ BenchmarkSuite suite(device, N); \ suite.FUNC(iters, DIM1, DIM2); \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc index adea754ad..9fe8f84d9 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cc +++ b/bench/tensors/tensor_benchmarks_gpu.cc @@ -3,10 +3,8 @@ #include #include #include -#include "strings/strcat.h" -#include "third_party/eigen3/tensor_benchmarks.h" - +#include "tensor_benchmarks.h" // Simple functions #define BM_FuncGPU(FUNC) \ -- cgit v1.2.3 From 270c4e1ecd8fd10c42760dd67adbbab0b1387da2 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Thu, 28 Jan 2016 11:11:45 -0800 Subject: bugfix --- bench/tensors/benchmark.h | 4 +- bench/tensors/benchmark_main.cc | 21 ++++++---- bench/tensors/tensor_benchmarks.h | 13 ++---- bench/tensors/tensor_benchmarks_gpu.cc | 73 ---------------------------------- bench/tensors/tensor_benchmarks_gpu.cu | 67 +++++++++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 91 
deletions(-) delete mode 100644 bench/tensors/tensor_benchmarks_gpu.cc create mode 100644 bench/tensors/tensor_benchmarks_gpu.cu diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h index d8b4fd4c6..2c06075e0 100644 --- a/bench/tensors/benchmark.h +++ b/bench/tensors/benchmark.h @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include @@ -45,4 +46,5 @@ void StopBenchmarkTiming(); void StartBenchmarkTiming(); #define BENCHMARK(f) \ static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ - (new ::testing::Benchmark(#f, f)) \ No newline at end of file + (new ::testing::Benchmark(#f, f)) + diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc index 0fc12960e..b2f457c96 100644 --- a/bench/tensors/benchmark_main.cc +++ b/bench/tensors/benchmark_main.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -27,8 +28,14 @@ static int64_t g_benchmark_total_time_ns; static int64_t g_benchmark_start_time_ns; typedef std::map BenchmarkMap; typedef BenchmarkMap::iterator BenchmarkMapIt; -static BenchmarkMap g_benchmarks; + +BenchmarkMap& gBenchmarks() { + static BenchmarkMap g_benchmarks; + return g_benchmarks; +} + static int g_name_column_width = 20; + static int Round(int n) { int base = 1; while (base*10 < n) { @@ -101,7 +108,7 @@ void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int fprintf(stderr, "%s: missing function\n", name_); exit(EXIT_FAILURE); } - g_benchmarks.insert(std::make_pair(name, this)); + gBenchmarks().insert(std::make_pair(name, this)); } void Benchmark::Run() { if (fn_ != NULL) { @@ -183,16 +190,16 @@ void StartBenchmarkTiming() { } } int main(int argc, char* argv[]) { - if (g_benchmarks.empty()) { + if (gBenchmarks().empty()) { fprintf(stderr, "No benchmarks registered!\n"); exit(EXIT_FAILURE); } - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { int name_width = static_cast(strlen(it->second->Name())); g_name_column_width = std::max(g_name_column_width, name_width); } bool need_header = true; - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { ::testing::Benchmark* b = it->second; if (b->ShouldRun(argc, argv)) { if (need_header) { @@ -206,10 +213,10 @@ int main(int argc, char* argv[]) { if (need_header) { fprintf(stderr, "No matching benchmarks!\n"); fprintf(stderr, "Available benchmarks:\n"); - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { fprintf(stderr, " %s\n", it->second->Name()); } exit(EXIT_FAILURE); } return 0; -} \ No newline at end of file +} diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index a1696afda..071326aa7 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -10,13 +10,6 @@ typedef int TensorIndex; #define BENCHMARK_RANGE(bench, lo, hi) \ BENCHMARK(bench)->Range(lo, hi) -template -std::string StrCat(const Args... 
args) { - std::stringstream ss; - StrCatRecursive(ss, args...); - return ss.str(); -} - using Eigen::Tensor; using Eigen::TensorMap; @@ -305,9 +298,9 @@ template class BenchmarkSuite { } - size_t m_; - size_t k_; - size_t n_; + TensorIndex m_; + TensorIndex k_; + TensorIndex n_; float* a_; float* b_; float* c_; diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc deleted file mode 100644 index 9fe8f84d9..000000000 --- a/bench/tensors/tensor_benchmarks_gpu.cc +++ /dev/null @@ -1,73 +0,0 @@ -#define EIGEN_USE_GPU - -#include -#include -#include - -#include "tensor_benchmarks.h" - -// Simple functions -#define BM_FuncGPU(FUNC) \ - static void BM_##FUNC(int iters, int N) { \ - StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ - Eigen::GpuDevice device(&stream); \ - BenchmarkSuite suite(device, N); \ - cudaDeviceSynchronize(); \ - suite.FUNC(iters); \ - cudaStreamDestroy(stream); \ - } \ - BENCHMARK_RANGE(BM_##FUNC, 10, 5000); - -BM_FuncGPU(memcpy); -BM_FuncGPU(random); -BM_FuncGPU(slicing); -BM_FuncGPU(shuffling); -BM_FuncGPU(padding); -BM_FuncGPU(striding); -BM_FuncGPU(broadcasting); -BM_FuncGPU(coeffWiseOp); -BM_FuncGPU(reduction); - - -// Contractions -#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ - static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ - StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ - Eigen::GpuDevice device(&stream); \ - BenchmarkSuite suite(device, D1, D2, D3); \ - cudaDeviceSynchronize(); \ - suite.FUNC(iters); \ - cudaStreamDestroy(stream); \ - } \ - BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); - - -BM_FuncWithInputDimsGPU(contraction, N, N, N); -BM_FuncWithInputDimsGPU(contraction, 64, N, N); -BM_FuncWithInputDimsGPU(contraction, N, 64, N); - - -// Convolutions -#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ - static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ - StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ - Eigen::GpuDevice device(&stream); \ - BenchmarkSuite suite(device, N); \ - cudaDeviceSynchronize(); \ - suite.FUNC(iters, DIM1, DIM2); \ - cudaStreamDestroy(stream); \ - } \ - BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); - -BM_FuncWithKernelDimsGPU(convolution, 7, 1); -BM_FuncWithKernelDimsGPU(convolution, 1, 7); -BM_FuncWithKernelDimsGPU(convolution, 7, 4); -BM_FuncWithKernelDimsGPU(convolution, 4, 7); -BM_FuncWithKernelDimsGPU(convolution, 7, 64); -BM_FuncWithKernelDimsGPU(convolution, 64, 7); diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu new file mode 100644 index 000000000..fbb486efd --- /dev/null +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -0,0 +1,67 @@ +#define EIGEN_USE_GPU + +#include +#include +#include + +#include "tensor_benchmarks.h" + +// Simple functions +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC, 10, 5000); + +BM_FuncGPU(memcpy); +BM_FuncGPU(random); +BM_FuncGPU(slicing); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); +BM_FuncGPU(broadcasting); +BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(reduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { 
\ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); + + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); -- cgit v1.2.3 From c1d900af61561194ad01b16a89da9f9ce1a09723 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 21:43:20 +0100 Subject: bug #178: remove additional const on nested expression, and remove several const_cast. --- Eigen/src/Core/ArrayWrapper.h | 38 +++++++++++++++++++------------------- Eigen/src/Core/Block.h | 31 ++++++++++++------------------- Eigen/src/Core/CwiseUnaryOp.h | 9 +++++---- Eigen/src/Core/CwiseUnaryView.h | 7 ++++--- Eigen/src/Core/Diagonal.h | 14 +++++++------- Eigen/src/Core/SelfAdjointView.h | 6 +++--- Eigen/src/Core/Transpose.h | 10 ++++++---- Eigen/src/Core/Transpositions.h | 2 +- Eigen/src/Core/TriangularMatrix.h | 9 ++++----- Eigen/src/Core/util/XprHelper.h | 10 +++++----- test/array_for_matrix.cpp | 10 ++++++++++ test/diagonal.cpp | 7 +++++++ 12 files changed, 83 insertions(+), 70 deletions(-) diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h index 4e484f290..6013d4d85 100644 --- a/Eigen/src/Core/ArrayWrapper.h +++ b/Eigen/src/Core/ArrayWrapper.h @@ -52,7 +52,7 @@ class ArrayWrapper : public ArrayBase > const Scalar >::type ScalarWithConstIfNotLvalue; - typedef typename internal::ref_selector::type NestedExpressionType; + typedef typename internal::ref_selector::non_const_type NestedExpressionType; EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {} @@ -67,7 +67,7 @@ class ArrayWrapper : public ArrayBase > inline Index innerStride() const { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); } + inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_expression.data(); } @@ -80,13 +80,13 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC @@ -98,13 +98,13 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) { - return m_expression.const_cast_derived().coeffRef(index); 
+ return m_expression.coeffRef(index); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } template @@ -116,7 +116,7 @@ class ArrayWrapper : public ArrayBase > template inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket(rowId, colId, val); + m_expression.template writePacket(rowId, colId, val); } template @@ -128,7 +128,7 @@ class ArrayWrapper : public ArrayBase > template inline void writePacket(Index index, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket(index, val); + m_expression.template writePacket(index, val); } template @@ -145,11 +145,11 @@ class ArrayWrapper : public ArrayBase > /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index) */ EIGEN_DEVICE_FUNC - void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); } + void resize(Index newSize) { m_expression.resize(newSize); } /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index,Index)*/ EIGEN_DEVICE_FUNC - void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); } + void resize(Index rows, Index cols) { m_expression.resize(rows,cols); } protected: NestedExpressionType m_expression; @@ -195,7 +195,7 @@ class MatrixWrapper : public MatrixBase > const Scalar >::type ScalarWithConstIfNotLvalue; - typedef typename internal::ref_selector::type NestedExpressionType; + typedef typename internal::ref_selector::non_const_type NestedExpressionType; EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {} @@ -210,7 +210,7 @@ class MatrixWrapper : public MatrixBase > inline Index innerStride() const { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); } + inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_expression.data(); } @@ -223,7 +223,7 @@ class MatrixWrapper : public MatrixBase > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC @@ -241,13 +241,13 @@ class MatrixWrapper : public MatrixBase > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } template @@ -259,7 +259,7 @@ class MatrixWrapper : public MatrixBase > template inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket(rowId, colId, val); + m_expression.template writePacket(rowId, colId, val); } template @@ -271,7 +271,7 @@ class MatrixWrapper : public MatrixBase > template inline void writePacket(Index index, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket(index, val); + m_expression.template writePacket(index, val); } EIGEN_DEVICE_FUNC @@ -284,11 +284,11 @@ class MatrixWrapper : public MatrixBase > /** Forwards the resizing request to the nested expression * \sa 
DenseBase::resize(Index) */ EIGEN_DEVICE_FUNC - void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); } + void resize(Index newSize) { m_expression.resize(newSize); } /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index,Index)*/ EIGEN_DEVICE_FUNC - void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); } + void resize(Index rows, Index cols) { m_expression.resize(rows,cols); } protected: NestedExpressionType m_expression; diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 599b714cc..cee5591f2 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -221,15 +221,13 @@ template inline PacketScalar packet(Index rowId, Index colId) const { - return m_xpr.template packet - (rowId + m_startRow.value(), colId + m_startCol.value()); + return m_xpr.template packet(rowId + m_startRow.value(), colId + m_startCol.value()); } template inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_xpr.const_cast_derived().template writePacket - (rowId + m_startRow.value(), colId + m_startCol.value(), val); + m_xpr.template writePacket(rowId + m_startRow.value(), colId + m_startCol.value(), val); } template @@ -288,7 +281,7 @@ template inline void writePacket(Index index, const PacketScalar& val) { - m_xpr.const_cast_derived().template writePacket + m_xpr.template writePacket (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val); } @@ -320,7 +313,7 @@ template m_startRow; const internal::variable_if_dynamic m_startCol; const internal::variable_if_dynamic m_blockRows; diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index 22db783b5..8c182303c 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -58,6 +58,7 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp) + typedef typename internal::ref_selector::type XprTypeNested; typedef typename internal::remove_all::type NestedExpression; EIGEN_DEVICE_FUNC @@ -75,16 +76,16 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::type& + const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } /** \returns the nested expression */ EIGEN_DEVICE_FUNC - typename internal::remove_all::type& - nestedExpression() { return m_xpr.const_cast_derived(); } + typename internal::remove_all::type& + nestedExpression() { return m_xpr; } protected: - typename XprType::Nested m_xpr; + XprTypeNested m_xpr; const UnaryOp m_functor; }; diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index a9252eddf..271033056 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -61,6 +61,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView) + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; typedef typename internal::remove_all::type NestedExpression; explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) @@ -75,15 +76,15 @@ class CwiseUnaryView : public CwiseUnaryViewImpl::type& + const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - typename internal::remove_all::type& + typename internal::remove_reference::type& nestedExpression() { return m_matrix.const_cast_derived(); } protected: - typename 
internal::ref_selector::type m_matrix; + MatrixTypeNested m_matrix; ViewOp m_functor; }; diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index fa3176266..bfea0584b 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -103,21 +103,21 @@ template class Diagonal >::type ScalarWithConstIfNotLvalue; EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); } + inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); } EIGEN_DEVICE_FUNC - inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); } + inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); } EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index) { EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset()); + return m_matrix.coeffRef(row+rowOffset(), row+colOffset()); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index) const { - return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset()); + return m_matrix.coeffRef(row+rowOffset(), row+colOffset()); } EIGEN_DEVICE_FUNC @@ -130,13 +130,13 @@ template class Diagonal inline Scalar& coeffRef(Index idx) { EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset()); + return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset()); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index idx) const { - return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset()); + return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset()); } EIGEN_DEVICE_FUNC @@ -159,7 +159,7 @@ template class Diagonal } protected: - typename MatrixType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; const internal::variable_if_dynamicindex m_index; private: diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index e709eb213..9fda02691 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -32,7 +32,7 @@ namespace internal { template struct traits > : traits { - typedef typename ref_selector::type MatrixTypeNested; + typedef typename ref_selector::non_const_type MatrixTypeNested; typedef typename remove_all::type MatrixTypeNestedCleaned; typedef MatrixType ExpressionType; typedef typename MatrixType::PlainObject FullMatrixType; @@ -97,7 +97,7 @@ template class SelfAdjointView { EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView); Base::check_coordinates_internal(row, col); - return m_matrix.const_cast_derived().coeffRef(row, col); + return m_matrix.coeffRef(row, col); } /** \internal */ @@ -107,7 +107,7 @@ template class SelfAdjointView EIGEN_DEVICE_FUNC const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; } EIGEN_DEVICE_FUNC - MatrixTypeNestedCleaned& nestedExpression() { return *const_cast(&m_matrix); } + MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; } /** Efficient triangular matrix times vector/matrix product */ template diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index f199d1086..bc232526a 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -54,6 +54,8 @@ template class Transpose { public: + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename TransposeImpl::StorageKind>::Base Base; 
EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose) typedef typename internal::remove_all::type NestedExpression; @@ -68,16 +70,16 @@ template class Transpose /** \returns the nested expression */ EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& + const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ EIGEN_DEVICE_FUNC - typename internal::remove_all::type& - nestedExpression() { return m_matrix.const_cast_derived(); } + typename internal::remove_reference::type& + nestedExpression() { return m_matrix; } protected: - typename MatrixType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; }; namespace internal { diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 678ba3288..19c17bb4a 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -325,7 +325,7 @@ class TranspositionsWrapper protected: - const typename IndicesType::Nested m_indices; + typename IndicesType::Nested m_indices; }; diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 27845e89c..f55b42eed 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -168,7 +168,7 @@ namespace internal { template struct traits > : traits { - typedef typename ref_selector::type MatrixTypeNested; + typedef typename ref_selector::non_const_type MatrixTypeNested; typedef typename remove_reference::type MatrixTypeNestedNonRef; typedef typename remove_all::type MatrixTypeNestedCleaned; typedef typename MatrixType::PlainObject FullMatrixType; @@ -213,7 +213,6 @@ template class TriangularView IsVectorAtCompileTime = false }; - // FIXME This, combined with const_cast_derived in transpose() leads to a const-correctness loophole EIGEN_DEVICE_FUNC explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {} @@ -235,7 +234,7 @@ template class TriangularView /** \returns a reference to the nested expression */ EIGEN_DEVICE_FUNC - NestedExpression& nestedExpression() { return *const_cast(&m_matrix); } + NestedExpression& nestedExpression() { return m_matrix; } typedef TriangularView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ @@ -255,7 +254,7 @@ template class TriangularView inline TransposeReturnType transpose() { EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - typename MatrixType::TransposeReturnType tmp(m_matrix.const_cast_derived()); + typename MatrixType::TransposeReturnType tmp(m_matrix); return TransposeReturnType(tmp); } @@ -418,7 +417,7 @@ template class TriangularViewImpl<_Mat { EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType); Base::check_coordinates_internal(row, col); - return derived().nestedExpression().const_cast_derived().coeffRef(row, col); + return derived().nestedExpression().coeffRef(row, col); } /** Assigns a triangular matrix to a triangular part of a dense matrix */ diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 8b71e2c62..9fe8cfcd1 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -466,17 +466,17 @@ struct special_scalar_op_base : public BaseType template struct special_scalar_op_base : public BaseType { - const CwiseUnaryOp, Derived> + const CwiseUnaryOp, const Derived> operator*(const OtherScalar& scalar) const { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN #endif - return CwiseUnaryOp, Derived> + return CwiseUnaryOp, const Derived> (*static_cast(this), 
scalar_multiple2_op(scalar)); } - inline friend const CwiseUnaryOp, Derived> + inline friend const CwiseUnaryOp, const Derived> operator*(const OtherScalar& scalar, const Derived& matrix) { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN @@ -485,13 +485,13 @@ struct special_scalar_op_base : publi return static_cast(matrix).operator*(scalar); } - const CwiseUnaryOp, Derived> + const CwiseUnaryOp, const Derived> operator/(const OtherScalar& scalar) const { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN #endif - return CwiseUnaryOp, Derived> + return CwiseUnaryOp, const Derived> (*static_cast(this), scalar_quotient2_op(scalar)); } }; diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp index 9667e1f14..db5f3b34a 100644 --- a/test/array_for_matrix.cpp +++ b/test/array_for_matrix.cpp @@ -68,6 +68,16 @@ template void array_for_matrix(const MatrixType& m) const Scalar& ref_a2 = m.array().matrix().coeffRef(0,0); VERIFY(&ref_a1 == &ref_m1); VERIFY(&ref_a2 == &ref_m2); + + // Check write accessors: + m1.array().coeffRef(0,0) = 1; + VERIFY_IS_APPROX(m1(0,0),Scalar(1)); + m1.array()(0,0) = 2; + VERIFY_IS_APPROX(m1(0,0),Scalar(2)); + m1.array().matrix().coeffRef(0,0) = 3; + VERIFY_IS_APPROX(m1(0,0),Scalar(3)); + m1.array().matrix()(0,0) = 4; + VERIFY_IS_APPROX(m1(0,0),Scalar(4)); } template void comparisons(const MatrixType& m) diff --git a/test/diagonal.cpp b/test/diagonal.cpp index 53814a588..ee00cad55 100644 --- a/test/diagonal.cpp +++ b/test/diagonal.cpp @@ -20,6 +20,8 @@ template void diagonal(const MatrixType& m) MatrixType m1 = MatrixType::Random(rows, cols), m2 = MatrixType::Random(rows, cols); + Scalar s1 = internal::random(); + //check diagonal() VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal()); m2.diagonal() = 2 * m1.diagonal(); @@ -58,6 +60,11 @@ template void diagonal(const MatrixType& m) VERIFY_IS_APPROX(m2.template diagonal(), static_cast(2) * m1.diagonal(N2)); m2.diagonal(N2)[0] *= 3; VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast(6) * m1.diagonal(N2)[0]); + + m2.diagonal(N2).x() = s1; + VERIFY_IS_APPROX(m2.diagonal(N2).x(), s1); + m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1; + VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1); } } -- cgit v1.2.3 From b908e071a80fce910efc82c1c50dd6be1e226dcd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 22:11:18 +0100 Subject: bug #178: get rid of some const_cast in SparseCore --- Eigen/src/SparseCore/SparseBlock.h | 37 +++++++++++++--------------- Eigen/src/SparseCore/SparseSelfAdjointView.h | 12 ++++----- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 10be84856..43fe788d9 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -100,11 +100,11 @@ protected: enum { OuterSize = IsRowMajor ? BlockRows : BlockCols }; public: - inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index i) + inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index i) : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize) {} - inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? 
blockRows : blockCols)) {} @@ -112,7 +112,7 @@ public: inline BlockType& operator=(const SparseMatrixBase& other) { typedef typename internal::remove_all::type _NestedMatrixType; - _NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);; + _NestedMatrixType& matrix = m_matrix; // This assignment is slow if this vector set is not empty // and/or it is not at the end of the nonzeros of the underlying matrix. @@ -209,28 +209,28 @@ public: inline const Scalar* valuePtr() const { return m_matrix.valuePtr(); } inline Scalar* valuePtr() - { return m_matrix.const_cast_derived().valuePtr(); } + { return m_matrix.valuePtr(); } inline const StorageIndex* innerIndexPtr() const { return m_matrix.innerIndexPtr(); } inline StorageIndex* innerIndexPtr() - { return m_matrix.const_cast_derived().innerIndexPtr(); } + { return m_matrix.innerIndexPtr(); } inline const StorageIndex* outerIndexPtr() const { return m_matrix.outerIndexPtr() + m_outerStart; } inline StorageIndex* outerIndexPtr() - { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; } + { return m_matrix.outerIndexPtr() + m_outerStart; } inline const StorageIndex* innerNonZeroPtr() const { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); } inline StorageIndex* innerNonZeroPtr() - { return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); } + { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); } bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; } inline Scalar& coeffRef(Index row, Index col) { - return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart)); + return m_matrix.coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart)); } inline const Scalar coeff(Index row, Index col) const @@ -264,7 +264,7 @@ public: protected: - typename SparseMatrixType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; Index m_outerStart; const internal::variable_if_dynamic m_outerSize; @@ -373,7 +373,7 @@ public: /** Column or Row constructor */ - inline BlockImpl(const XprType& xpr, Index i) + inline BlockImpl(XprType& xpr, Index i) : m_matrix(xpr), m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? convert_index(i) : 0), m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? convert_index(i) : 0), @@ -383,7 +383,7 @@ public: /** Dynamic-size constructor */ - inline BlockImpl(const XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : m_matrix(xpr), m_startRow(convert_index(startRow)), m_startCol(convert_index(startCol)), m_blockRows(convert_index(blockRows)), m_blockCols(convert_index(blockCols)) {} @@ -392,8 +392,7 @@ public: inline Scalar& coeffRef(Index row, Index col) { - return m_matrix.const_cast_derived() - .coeffRef(row + m_startRow.value(), col + m_startCol.value()); + return m_matrix.coeffRef(row + m_startRow.value(), col + m_startCol.value()); } inline const Scalar coeff(Index row, Index col) const @@ -403,16 +402,14 @@ public: inline Scalar& coeffRef(Index index) { - return m_matrix.const_cast_derived() - .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), - m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + return m_matrix.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 
0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); } inline const Scalar coeff(Index index) const { - return m_matrix.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), - m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + return m_matrix.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); } inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; } @@ -430,7 +427,7 @@ public: EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) - typename XprType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; const internal::variable_if_dynamic m_startRow; const internal::variable_if_dynamic m_startCol; const internal::variable_if_dynamic m_blockRows; diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 975cefd28..402733cce 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -55,10 +55,10 @@ template class SparseSelfAdjointView typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix VectorI; - typedef typename MatrixType::Nested MatrixTypeNested; + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; typedef typename internal::remove_all::type _MatrixTypeNested; - explicit inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix) + explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix) { eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices"); } @@ -68,7 +68,7 @@ template class SparseSelfAdjointView /** \internal \returns a reference to the nested matrix */ const _MatrixTypeNested& matrix() const { return m_matrix; } - _MatrixTypeNested& matrix() { return m_matrix.const_cast_derived(); } + typename internal::remove_reference::type& matrix() { return m_matrix; } /** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs. * @@ -158,7 +158,7 @@ template class SparseSelfAdjointView protected: - typename MatrixType::Nested m_matrix; + MatrixTypeNested m_matrix; //mutable VectorI m_countPerRow; //mutable VectorI m_countPerCol; private: @@ -194,9 +194,9 @@ SparseSelfAdjointView::rankUpdate(const SparseMatrixBase tmp = u * u.adjoint(); if(alpha==Scalar(0)) - m_matrix.const_cast_derived() = tmp.template triangularView(); + m_matrix = tmp.template triangularView(); else - m_matrix.const_cast_derived() += alpha * tmp.template triangularView(); + m_matrix += alpha * tmp.template triangularView(); return *this; } -- cgit v1.2.3 From acce4dd0500fbb9524fe35aacafb7fbc5f7f76f9 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 28 Jan 2016 15:07:26 -0800 Subject: Change Eigen's ColPivHouseholderQR to use the numerically stable norm downdate formula from http://www.netlib.org/lapack/lawnspdf/lawn176.pdf, which has been used in LAPACK's xGEQPF and xGEQP3 since 2006. With the old formula, the code chooses the wrong pivots and fails to correctly determine rank on graded matrices. This change also adds additional checks for non-increasing diagonal in R11 to existing unit tests, and adds a new unit test with the Kahan matrix, which consistently fails for the original code. Benchmark timings on Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz. Code compiled with AVX & FMA. I just ran on square matrices of 3 different sizes.
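For reference, a sketch of the numerical difference, stated in the notation of the patch below: the old code tracked squared partial column norms and downdated them after eliminating column k as

  ||a_j||^2  <-  ||a_j||^2 - |r_kj|^2,

a subtraction that cancels catastrophically once |r_kj| approaches ||a_j||, which is what misleads the pivot choice on graded matrices. The LAWN 176 formula instead rescales the norm itself,

  norm_j  <-  norm_j * sqrt(max(0, 1 - (|r_kj| / norm_j)^2)),

and falls back to recomputing norm_j = ||A(k+1:m, j)|| directly whenever the accumulated rescalings, tracked against the last directly computed value, drop below sqrt(machine epsilon).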
Benchmark Time(ns) CPU(ns) Iterations ------------------------------------------------------- Before: BM_EigencolPivQR/64 53677 53627 12890 BM_EigencolPivQR/512 15265408 15250784 46 BM_EigencolPivQR/4k 15403556228 15388788368 2 After (non-vectorized version): Benchmark Time(ns) CPU(ns) Iterations Degradation -------------------------------------------------------------------- BM_EigencolPivQR/64 63736 63669 10844 18.5% BM_EigencolPivQR/512 16052546 16037381 43 5.1% BM_EigencolPivQR/4k 15149263620 15132025316 2 -2.0% Performance-wise there seems to be a ~18.5% degradation for small (64x64) matrices, probably due to the cost of more O(min(m,n)^2) sqrt operations that are not needed for the unstable formula. --- Eigen/src/QR/ColPivHouseholderQR.h | 121 +++++++++++++++++++++---------------- test/qr_colpivoting.cpp | 86 ++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 52 deletions(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index d8bd4b950..61c6fdf09 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -11,7 +11,7 @@ #ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_H #define EIGEN_COLPIVOTINGHOUSEHOLDERQR_H -namespace Eigen { +namespace Eigen { namespace internal { template struct traits > @@ -31,11 +31,11 @@ template struct traits > * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R - * such that + * such that * \f[ * \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R} * \f] - * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an + * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an * upper triangular matrix. * * This decomposition performs column pivoting in order to be rank-revealing and improve @@ -67,11 +67,11 @@ template class ColPivHouseholderQR typedef typename internal::plain_row_type::type RealRowVectorType; typedef HouseholderSequence::type> HouseholderSequenceType; typedef typename MatrixType::PlainObject PlainObject; - + private: - + typedef typename PermutationType::StorageIndex PermIndexType; - + public: /** @@ -86,7 +86,7 @@ template class ColPivHouseholderQR m_colsPermutation(), m_colsTranspositions(), m_temp(), - m_colSqNorms(), + m_colNorms(), m_isInitialized(false), m_usePrescribedThreshold(false) {} @@ -102,7 +102,7 @@ template class ColPivHouseholderQR m_colsPermutation(PermIndexType(cols)), m_colsTranspositions(cols), m_temp(cols), - m_colSqNorms(cols), + m_colNorms(cols), m_isInitialized(false), m_usePrescribedThreshold(false) {} @@ -110,12 +110,12 @@ template class ColPivHouseholderQR * * This constructor computes the QR factorization of the matrix \a matrix by calling * the method compute(). 
It is a short cut for: - * + * * \code * ColPivHouseholderQR qr(matrix.rows(), matrix.cols()); * qr.compute(matrix); * \endcode - * + * * \sa compute() */ template @@ -125,7 +125,7 @@ template class ColPivHouseholderQR m_colsPermutation(PermIndexType(matrix.cols())), m_colsTranspositions(matrix.cols()), m_temp(matrix.cols()), - m_colSqNorms(matrix.cols()), + m_colNorms(matrix.cols()), m_isInitialized(false), m_usePrescribedThreshold(false) { @@ -160,7 +160,7 @@ template class ColPivHouseholderQR HouseholderSequenceType householderQ() const; HouseholderSequenceType matrixQ() const { - return householderQ(); + return householderQ(); } /** \returns a reference to the matrix where the Householder QR decomposition is stored @@ -170,14 +170,14 @@ template class ColPivHouseholderQR eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); return m_qr; } - - /** \returns a reference to the matrix where the result Householder QR is stored - * \warning The strict lower part of this matrix contains internal values. + + /** \returns a reference to the matrix where the result Householder QR is stored + * \warning The strict lower part of this matrix contains internal values. * Only the upper triangular part should be referenced. To get it, use * \code matrixR().template triangularView() \endcode - * For rank-deficient matrices, use - * \code - * matrixR().topLeftCorner(rank(), rank()).template triangularView() + * For rank-deficient matrices, use + * \code + * matrixR().topLeftCorner(rank(), rank()).template triangularView() * \endcode */ const MatrixType& matrixR() const @@ -185,7 +185,7 @@ template class ColPivHouseholderQR eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); return m_qr; } - + template ColPivHouseholderQR& compute(const EigenBase& matrix); @@ -305,9 +305,9 @@ template class ColPivHouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. - * + * * For advanced uses only. */ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } @@ -380,19 +380,19 @@ template class ColPivHouseholderQR * diagonal coefficient of R. */ RealScalar maxPivot() const { return m_maxpivot; } - + /** \brief Reports whether the QR factorization was succesful. * - * \note This function always returns \c Success. It is provided for compatibility + * \note This function always returns \c Success. It is provided for compatibility * with other factorization routines. 
- * \returns \c Success + * \returns \c Success */ ComputationInfo info() const { eigen_assert(m_isInitialized && "Decomposition is not initialized."); return Success; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -400,20 +400,20 @@ template class ColPivHouseholderQR #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; PermutationType m_colsPermutation; IntRowVectorType m_colsTranspositions; RowVectorType m_temp; - RealRowVectorType m_colSqNorms; + RealRowVectorType m_colNorms; bool m_isInitialized, m_usePrescribedThreshold; RealScalar m_prescribedThreshold, m_maxpivot; Index m_nonzero_pivots; @@ -448,14 +448,14 @@ template ColPivHouseholderQR& ColPivHouseholderQR::compute(const EigenBase& matrix) { check_template_parameters(); - + // the column permutation is stored as int indices, so just to be sure: eigen_assert(matrix.cols()<=NumTraits::highest()); m_qr = matrix; - + computeInPlace(); - + return *this; } @@ -463,10 +463,11 @@ template void ColPivHouseholderQR::computeInPlace() { using std::abs; + Index rows = m_qr.rows(); Index cols = m_qr.cols(); Index size = m_qr.diagonalSize(); - + m_hCoeffs.resize(size); m_temp.resize(cols); @@ -474,31 +475,24 @@ void ColPivHouseholderQR::computeInPlace() m_colsTranspositions.resize(m_qr.cols()); Index number_of_transpositions = 0; - m_colSqNorms.resize(cols); - for(Index k = 0; k < cols; ++k) - m_colSqNorms.coeffRef(k) = m_qr.col(k).squaredNorm(); + m_colNorms.resize(cols); + for (Index k = 0; k < cols; ++k) + m_colNorms.coeffRef(k) = m_qr.col(k).norm(); + RealRowVectorType colNormsMostRecentDirect(m_colNorms); - RealScalar threshold_helper = m_colSqNorms.maxCoeff() * numext::abs2(NumTraits::epsilon()) / RealScalar(rows); + RealScalar threshold_helper = numext::abs2(m_colNorms.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); + RealScalar norm_downdate_threshold = numext::sqrt(NumTraits::epsilon()); m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case) m_maxpivot = RealScalar(0); for(Index k = 0; k < size; ++k) { - // first, we look up in our table m_colSqNorms which column has the biggest squared norm + // first, we look up in our table m_colNorms which column has the biggest norm Index biggest_col_index; - RealScalar biggest_col_sq_norm = m_colSqNorms.tail(cols-k).maxCoeff(&biggest_col_index); + RealScalar biggest_col_sq_norm = numext::abs2(m_colNorms.tail(cols-k).maxCoeff(&biggest_col_index)); biggest_col_index += k; - // since our table m_colSqNorms accumulates imprecision at every step, we must now recompute - // the actual squared norm of the selected column. - // Note that not doing so does result in solve() sometimes returning inf/nan values - // when running the unit test with 1000 repetitions. - biggest_col_sq_norm = m_qr.col(biggest_col_index).tail(rows-k).squaredNorm(); - - // we store that back into our table: it can't hurt to correct our table. - m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm; - // Track the number of meaningful pivots but do not stop the decomposition to make // sure that the initial matrix is properly reproduced. See bug 941. 
if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) @@ -508,7 +502,9 @@ void ColPivHouseholderQR::computeInPlace() m_colsTranspositions.coeffRef(k) = biggest_col_index; if(k != biggest_col_index) { m_qr.col(k).swap(m_qr.col(biggest_col_index)); - std::swap(m_colSqNorms.coeffRef(k), m_colSqNorms.coeffRef(biggest_col_index)); + std::swap(m_colNorms.coeffRef(k), m_colNorms.coeffRef(biggest_col_index)); + std::swap(colNormsMostRecentDirect.coeffRef(k), + colNormsMostRecentDirect.coeffRef(biggest_col_index)); ++number_of_transpositions; } @@ -526,8 +522,29 @@ void ColPivHouseholderQR::computeInPlace() m_qr.bottomRightCorner(rows-k, cols-k-1) .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows-k-1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k+1)); - // update our table of squared norms of the columns - m_colSqNorms.tail(cols-k-1) -= m_qr.row(k).tail(cols-k-1).cwiseAbs2(); + // update our table of norms of the columns + for (Index j = k + 1; j < cols; ++j) { + // The following implements the stable norm downdate step discussed in + // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf + // and used in LAPACK routines xGEQPF and xGEQP3. + // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html + if (m_colNorms.coeffRef(j) != 0) { + RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNorms.coeffRef(j); + temp = (RealScalar(1) + temp) * (RealScalar(1) - temp); + temp = temp < 0 ? 0 : temp; + RealScalar temp2 = + temp * numext::abs2(m_colNorms.coeffRef(j) / + colNormsMostRecentDirect.coeffRef(j)); + if (temp2 <= norm_downdate_threshold) { + // The updated norm has become too inaccurate, so re-compute the column + // norm directly. + m_colNorms.coeffRef(j) = m_qr.col(j).tail(rows - k - 1).norm(); + colNormsMostRecentDirect.coeffRef(j) = m_colNorms.coeffRef(j); + } else { + m_colNorms.coeffRef(j) *= numext::sqrt(temp); + } + } + } } m_colsPermutation.setIdentity(PermIndexType(cols)); @@ -578,7 +595,7 @@ struct Assignment >, interna typedef ColPivHouseholderQR QrType; typedef Inverse SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) - { + { dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols())); } }; diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index e7abd3725..7b97292db 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -19,6 +19,7 @@ template void qr() Index rank = internal::random(1, (std::min)(rows, cols)-1); typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; typedef Matrix MatrixQType; MatrixType m1; createRandomPIMatrixOfRank(rank,rows,cols,m1); @@ -36,6 +37,24 @@ template void qr() MatrixType c = q * r * qr.colsPermutation().inverse(); VERIFY_IS_APPROX(m1, c); + // Verify that the absolute values of the diagonal elements in R are + // non-increasing until they reach the singularity threshold.
+ RealScalar threshold = + std::sqrt(RealScalar(rows)) * (std::abs)(r(0, 0)) * NumTraits::epsilon(); + for (Index i = 0; i < (std::min)(rows, cols) - 1; ++i) { + RealScalar x = (std::abs)(r(i, i)); + RealScalar y = (std::abs)(r(i + 1, i + 1)); + if (x < threshold && y < threshold) continue; + if (test_isApproxOrLessThan(x, y)) { + for (Index j = 0; j < (std::min)(rows, cols); ++j) { + std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl; + } + std::cout << "Failure at i=" << i << ", rank=" << rank + << ", threshold=" << threshold << std::endl; + } + VERIFY_IS_APPROX_OR_LESS_THAN(y, x); + } + MatrixType m2 = MatrixType::Random(cols,cols2); MatrixType m3 = m1*m2; m2 = MatrixType::Random(cols,cols2); @@ -47,6 +66,7 @@ template void qr_fixedsize() { enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime }; typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; int rank = internal::random(1, (std::min)(int(Rows), int(Cols))-1); Matrix m1; createRandomPIMatrixOfRank(rank,Rows,Cols,m1); @@ -66,6 +86,67 @@ template void qr_fixedsize() m2 = Matrix::Random(Cols,Cols2); m2 = qr.solve(m3); VERIFY_IS_APPROX(m3, m1*m2); + // Verify that the absolute values of the diagonal elements in R are + // non-increasing until they reach the singularity threshold. + RealScalar threshold = + std::sqrt(RealScalar(Rows)) * (std::abs)(r(0, 0)) * NumTraits::epsilon(); + for (Index i = 0; i < (std::min)(int(Rows), int(Cols)) - 1; ++i) { + RealScalar x = (std::abs)(r(i, i)); + RealScalar y = (std::abs)(r(i + 1, i + 1)); + if (x < threshold && y < threshold) continue; + if (test_isApproxOrLessThan(x, y)) { + for (Index j = 0; j < (std::min)(int(Rows), int(Cols)); ++j) { + std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl; + } + std::cout << "Failure at i=" << i << ", rank=" << rank + << ", threshold=" << threshold << std::endl; + } + VERIFY_IS_APPROX_OR_LESS_THAN(y, x); + } +} + +// This test is meant to verify that pivots are chosen such that +// even for a graded matrix, the diagonal of R falls off roughly +// monotonically until it reaches the threshold for singularity. +// We use the so-called Kahan matrix, which is a famous counter-example +// for rank-revealing QR. See +// http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +// page 3 for more detail.
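+// For reference, a sketch of the construction coded below: the n-by-n Kahan
+// matrix follows the pattern K = diag(1, s, s^2, ..., s^{n-1}) * (I - c*N),
+// with s^2 + c^2 = 1 and N the strictly upper triangular all-ones matrix,
+// so row i carries the diagonal entry s^i followed by -c*s^i repeated across
+// the rest of the row.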
+template void qr_kahan_matrix() +{ + typedef typename MatrixType::Index Index; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + + Index rows = 300, cols = rows; + + MatrixType m1; + m1.setZero(rows,cols); + RealScalar s = std::pow(NumTraits::epsilon(), 1.0 / rows); + RealScalar c = std::sqrt(1 - s*s); + for (Index i = 0; i < rows; ++i) { + m1(i, i) = pow(s, i); + m1.row(i).tail(rows - i - 1) = -pow(s, i) * c * MatrixType::Ones(1, rows - i - 1); + } + m1 = (m1 + m1.transpose()).eval(); + ColPivHouseholderQR qr(m1); + MatrixType r = qr.matrixQR().template triangularView(); + + RealScalar threshold = + std::sqrt(RealScalar(rows)) * (std::abs)(r(0, 0)) * NumTraits::epsilon(); + for (Index i = 0; i < (std::min)(rows, cols) - 1; ++i) { + RealScalar x = (std::abs)(r(i, i)); + RealScalar y = (std::abs)(r(i + 1, i + 1)); + if (x < threshold && y < threshold) continue; + if (test_isApproxOrLessThan(x, y)) { + for (Index j = 0; j < (std::min)(rows, cols); ++j) { + std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl; + } + std::cout << "Failure at i=" << i << ", rank=" << qr.rank() + << ", threshold=" << threshold << std::endl; + } + VERIFY_IS_APPROX_OR_LESS_THAN(y, x); + } } template void qr_invertible() @@ -131,6 +212,11 @@ void test_qr_colpivoting() CALL_SUBTEST_5(( qr_fixedsize, 1 >() )); } + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( qr_kahan_matrix() ); + CALL_SUBTEST_2( qr_kahan_matrix() ); + } + for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( qr_invertible() ); CALL_SUBTEST_2( qr_invertible() ); -- cgit v1.2.3 From 7b3044d086f413fdaf65acd30fc3bc469d43ccc6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 15:36:34 -0800 Subject: Made sure to call nvcc with the relaxed-constexpr flag. 
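Without this flag, nvcc refuses to let device code call a constexpr function that is not annotated __device__; newer CUDA toolkits spell the same option --expt-relaxed-constexpr. A minimal sketch of the pattern it unblocks (hypothetical names, not code from this tree):

constexpr float half_scale() { return 0.5f; }     // host-side constexpr, no __device__

__global__ void scale(float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= half_scale();                // only compiles with relaxed constexpr
}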
--- unsupported/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d16c42656..eed724bcf 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -158,7 +158,7 @@ if(CUDA_FOUND) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() - set(CUDA_NVCC_FLAGS "-std=c++11 -arch compute_30") + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") -- cgit v1.2.3 From c8d5f21941a41556f94e937ea5a91badb7fb9353 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 16:20:36 -0800 Subject: Added extra tensor benchmarks --- bench/tensors/tensor_benchmarks.h | 87 +++++++++++++++++++++++++++++++--- bench/tensors/tensor_benchmarks_cpu.cc | 28 +++++++++-- bench/tensors/tensor_benchmarks_gpu.cu | 7 ++- 3 files changed, 111 insertions(+), 11 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 071326aa7..6b9d13446 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -45,6 +45,20 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void typeCasting(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array sizes = {{m_, k_}}; + const TensorMap, Eigen::Aligned> A(a_, sizes); + TensorMap, Eigen::Aligned> B((int*)b_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.cast(); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * k_ * num_iters); + } + void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); const Eigen::array sizes = {{m_, m_}}; @@ -87,6 +101,34 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void rowChip(int num_iters) { + const Eigen::array input_size = {{k_, n_}}; + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size = {{n_}}; + TensorMap, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(n_ * num_iters); + } + + void colChip(int num_iters) { + const Eigen::array input_size= {{k_, n_}}; + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size = {{n_}}; + TensorMap, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } + // Record the number of values copied from the rhs chip to the lhs. 
+ finalizeBenchmark(n_ * num_iters); + } + void shuffling(int num_iters) { eigen_assert(m_ == n_); const Eigen::array size_a = {{m_, k_}}; @@ -147,7 +189,6 @@ template class BenchmarkSuite { TensorMap, Eigen::Aligned> C(c_, size_c); #ifndef EIGEN_HAS_INDEX_LIST - // nvcc doesn't support cxx11 const Eigen::array broadcast = {{1, n_}}; #else // Take advantage of cxx11 to give the compiler information it can use to @@ -212,14 +253,20 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } - // Simple reduction - void reduction(int num_iters) { + // Row reduction + void rowReduction(int num_iters) { const Eigen::array input_size = {{k_, n_}}; - const TensorMap, Eigen::Aligned> B(b_, input_size); + const TensorMap, Eigen::Aligned> B(b_, input_size); const Eigen::array output_size = {{n_}}; - TensorMap, Eigen::Aligned> C(c_, output_size); + TensorMap, Eigen::Aligned> C(c_, output_size); - const Eigen::array sum_along_dim = {{0}}; +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array sum_along_dim(0); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList> sum_along_dim; +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -227,7 +274,33 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(k_ * n_ * num_iters); + } + + // Column reduction + void colReduction(int num_iters) { + const Eigen::array input_size = {{k_, n_}}; + const TensorMap, Eigen::Aligned> B( + b_, input_size); + const Eigen::array output_size = {{k_}}; + TensorMap, Eigen::Aligned> C( + c_, output_size); + +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array sum_along_dim = {{1}}; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
+ Eigen::IndexList> sum_along_dim; +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(k_ * n_ * num_iters); } // do a contraction which is equivalent to a matrix multiplication diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 248a63861..6754e1a32 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -22,6 +22,10 @@ BM_FuncCPU(memcpy, 4); BM_FuncCPU(memcpy, 8); BM_FuncCPU(memcpy, 12); +BM_FuncCPU(typeCasting, 4); +BM_FuncCPU(typeCasting, 8); +BM_FuncCPU(typeCasting, 12); + BM_FuncCPU(random, 4); BM_FuncCPU(random, 8); BM_FuncCPU(random, 12); @@ -30,6 +34,14 @@ BM_FuncCPU(slicing, 4); BM_FuncCPU(slicing, 8); BM_FuncCPU(slicing, 12); +BM_FuncCPU(rowChip, 4); +BM_FuncCPU(rowChip, 8); +BM_FuncCPU(rowChip, 12); + +BM_FuncCPU(colChip, 4); +BM_FuncCPU(colChip, 8); +BM_FuncCPU(colChip, 12); + BM_FuncCPU(shuffling, 4); BM_FuncCPU(shuffling, 8); BM_FuncCPU(shuffling, 12); @@ -58,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4); BM_FuncCPU(transcendentalFunc, 8); BM_FuncCPU(transcendentalFunc, 12); -BM_FuncCPU(reduction, 4); -BM_FuncCPU(reduction, 8); -BM_FuncCPU(reduction, 12); +BM_FuncCPU(rowReduction, 4); +BM_FuncCPU(rowReduction, 8); +BM_FuncCPU(rowReduction, 12); + +BM_FuncCPU(colReduction, 4); +BM_FuncCPU(colReduction, 8); +BM_FuncCPU(colReduction, 12); // Contractions @@ -98,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16); + BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index fbb486efd..fe807d2ab 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -19,6 +19,7 @@ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); BM_FuncGPU(random); BM_FuncGPU(slicing); BM_FuncGPU(shuffling); @@ -26,7 +27,10 @@ BM_FuncGPU(padding); BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); -BM_FuncGPU(reduction); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); // Contractions @@ -45,6 +49,7 @@ BM_FuncGPU(reduction); BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); // Convolutions -- cgit v1.2.3 From a68864b6bce5e00fdec07a9d4dae7376dedb654e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 16:51:40 -0800 Subject: Updated the benchmarking code to print the number of flops processed instead of the number of bytes. 
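With this change the harness reports MFlops/s instead of MiB/s: each benchmark records its operation count through SetBenchmarkFlopsProcessed(), and the timing code divides that count by the measured wall time, along the lines of the reporting path in the diff below:

double mflops_processed = static_cast<double>(g_flops_processed) / 1e6;
double seconds = static_cast<double>(g_benchmark_total_time_ns) / 1e9;
snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed / seconds);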
--- bench/tensors/benchmark.h | 3 +-- bench/tensors/benchmark_main.cc | 14 +++++++------- bench/tensors/tensor_benchmarks.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h index 2c06075e0..f115b54ad 100644 --- a/bench/tensors/benchmark.h +++ b/bench/tensors/benchmark.h @@ -41,10 +41,9 @@ class Benchmark { void RunWithArg(int arg); }; } // namespace testing -void SetBenchmarkBytesProcessed(int64_t); +void SetBenchmarkFlopsProcessed(int64_t); void StopBenchmarkTiming(); void StartBenchmarkTiming(); #define BENCHMARK(f) \ static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ (new ::testing::Benchmark(#f, f)) - diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc index b2f457c96..65dbd89bb 100644 --- a/bench/tensors/benchmark_main.cc +++ b/bench/tensors/benchmark_main.cc @@ -23,7 +23,7 @@ #include #include -static int64_t g_bytes_processed; +static int64_t g_flops_processed; static int64_t g_benchmark_total_time_ns; static int64_t g_benchmark_start_time_ns; typedef std::map BenchmarkMap; @@ -124,7 +124,7 @@ void Benchmark::Run() { } } void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) { - g_bytes_processed = 0; + g_flops_processed = 0; g_benchmark_total_time_ns = 0; g_benchmark_start_time_ns = NanoTime(); if (fn_ != NULL) { @@ -153,10 +153,10 @@ void Benchmark::RunWithArg(int arg) { } char throughput[100]; throughput[0] = '\0'; - if (g_benchmark_total_time_ns > 0 && g_bytes_processed > 0) { - double mib_processed = static_cast(g_bytes_processed)/1e6; + if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) { + double mflops_processed = static_cast(g_flops_processed)/1e6; double seconds = static_cast(g_benchmark_total_time_ns)/1e9; - snprintf(throughput, sizeof(throughput), " %8.2f MiB/s", mib_processed/seconds); + snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds); } char full_name[100]; if (fn_range_ != NULL) { @@ -175,8 +175,8 @@ void Benchmark::RunWithArg(int arg) { fflush(stdout); } } // namespace testing -void SetBenchmarkBytesProcessed(int64_t x) { - g_bytes_processed = x; +void SetBenchmarkFlopsProcessed(int64_t x) { + g_flops_processed = x; } void StopBenchmarkTiming() { if (g_benchmark_start_time_ns != 0) { diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 6b9d13446..ba7e7eb48 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -367,7 +367,7 @@ template class BenchmarkSuite { } #endif StopBenchmarkTiming(); - SetBenchmarkBytesProcessed(num_items); + SetBenchmarkFlopsProcessed(num_items); } -- cgit v1.2.3 From 120e13b1b68763f50b30bb83733dcefa9ed966c4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 17:06:00 -0800 Subject: Added a readme to explain how to compile the tensor benchmarks. --- bench/tensors/README | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 bench/tensors/README diff --git a/bench/tensors/README b/bench/tensors/README new file mode 100644 index 000000000..9e0ac0ce8 --- /dev/null +++ b/bench/tensors/README @@ -0,0 +1,8 @@ +Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. 
+ +To compile the CPU benchmarks, simply call: +g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG-pthread -mavx -o benchmarks_cpu + +To compile the GPU benchmarks, simply call: +nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu + -- cgit v1.2.3 From bd2e5a788ac074535b4f973ac81ac61d4a166288 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 17:10:40 -0800 Subject: Made sure the number of floating point operations done by a benchmark is computed using 64 bit integers to avoid overflows. --- bench/tensors/tensor_benchmarks.h | 40 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index ba7e7eb48..365504009 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -13,8 +13,6 @@ typedef int TensorIndex; using Eigen::Tensor; using Eigen::TensorMap; -typedef int64_t int64; - // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. template <typename Device> class BenchmarkSuite { @@ -42,7 +40,7 @@ template <typename Device> class BenchmarkSuite { device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); } // Record the number of values copied per second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void typeCasting(int num_iters) { @@ -56,7 +54,7 @@ template <typename Device> class BenchmarkSuite { B.device(device_) = A.cast(); } // Record the number of values copied per second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void random(int num_iters) { @@ -69,7 +67,7 @@ template <typename Device> class BenchmarkSuite { C.device(device_) = C.random(); } // Record the number of random numbers generated per second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void slicing(int num_iters) { @@ -98,7 +96,7 @@ template <typename Device> class BenchmarkSuite { } // Record the number of values copied from the rhs slice to the lhs slice // each second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void rowChip(int num_iters) { @@ -112,7 +110,7 @@ template <typename Device> class BenchmarkSuite { C.device(device_) = B.chip(iter % k_, 0); } // Record the number of values copied from the rhs chip to the lhs. - finalizeBenchmark(n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(n_) * num_iters); } void colChip(int num_iters) { @@ -126,7 +124,7 @@ template <typename Device> class BenchmarkSuite { C.device(device_) = B.chip(iter % n_, 1); } // Record the number of values copied from the rhs chip to the lhs. - finalizeBenchmark(n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(n_) * num_iters); } void shuffling(int num_iters) { @@ -143,7 +141,7 @@ template <typename Device> class BenchmarkSuite { B.device(device_) = A.shuffle(shuffle); } // Record the number of values shuffled from A and copied to B each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void padding(int num_iters) { @@ -162,7 +160,7 @@ template <typename Device> class BenchmarkSuite { B.device(device_) = A.pad(paddings); } // Record the number of values copied from the padded tensor A each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void striding(int num_iters) { @@ -179,7 +177,7 @@ template <typename Device> class BenchmarkSuite { B.device(device_) = A.stride(strides); } // Record the number of values copied from the padded tensor A each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void broadcasting(int num_iters) { @@ -202,7 +200,7 @@ template <typename Device> class BenchmarkSuite { C.device(device_) = A.broadcast(broadcast); } // Record the number of values broadcasted from A and copied to C each second - finalizeBenchmark(m_ * n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters); } void coeffWiseOp(int num_iters) { @@ -218,7 +216,7 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (2 multiplications and // 1 addition per value) - finalizeBenchmark(3 * m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters); } void algebraicFunc(int num_iters) { @@ -234,7 +232,7 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void transcendentalFunc(int num_iters) { @@ -250,7 +248,7 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } // Row reduction @@ -274,7 +272,7 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(k_ * n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters); } // Column reduction @@ -300,7 +298,7 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(k_ * n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters); } // do a contraction which is equivalent to a matrix multiplication @@ -322,7 +320,7 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (size_ multiplications and // additions for each value in the resulting tensor) - finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters); } void convolution(int num_iters, int kernel_x, int kernel_y) { @@ -341,8 +339,8 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (kernel_size // multiplications and additions for each value in the resulting tensor) - finalizeBenchmark( - (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters); + finalizeBenchmark(static_cast<int64_t>(2) * + (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters); } private: @@ -360,7 +358,7 @@ template <typename Device> 
class BenchmarkSuite { //BenchmarkUseRealTime(); } - inline void finalizeBenchmark(int64 num_items) { + inline void finalizeBenchmark(int64_t num_items) { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) if (Eigen::internal::is_same::value) { device_.synchronize(); -- cgit v1.2.3 From 211d350fc332a86e5eeb1c9a4ab598756c2eddbf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 17:13:04 -0800 Subject: Fixed a typo --- bench/tensors/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/tensors/README b/bench/tensors/README index 9e0ac0ce8..6b51fe878 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -1,7 +1,7 @@ Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. To compile the CPU benchmarks, simply call: -g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG-pthread -mavx -o benchmarks_cpu +g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu To compile the GPU benchmarks, simply call: nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu -- cgit v1.2.3 From 3fde202215875568219a7287415385e57f58413f Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Thu, 28 Jan 2016 21:27:00 -0500 Subject: Making ceil() functor generic w.r.t packet type --- Eigen/src/Core/functors/UnaryFunctors.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 897ab04ba..531beead6 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -666,7 +666,7 @@ struct functor_traits > template struct scalar_ceil_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); } - typedef typename packet_traits::type Packet; + template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); } }; template -- cgit v1.2.3 From d3f533b395e56b740c9f7c6f7272d3384c10222a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 20:09:45 -0800 Subject: Fixed compilation warning --- Eigen/src/Core/GenericPacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 4c7d1d848..8f63af7cb 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -285,7 +285,7 @@ template EIGEN_DEVICE_FUNC inline void pstoreu { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ -template inline void prefetch(const Scalar* addr) +template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { #ifdef __CUDA_ARCH__ #if defined(__LP64__) -- cgit v1.2.3 From 10bea90c4add286b8d10473ba272660ef4210083 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 20:52:08 -0800 Subject: Fixed clang related compilation error --- bench/tensors/tensor_benchmarks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 365504009..f3ec70a9e 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -259,7 +259,7 @@ template class BenchmarkSuite { TensorMap, Eigen::Aligned> C(c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST - const Eigen::array sum_along_dim(0); + const 
Eigen::array sum_along_dim = {{0}}; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. -- cgit v1.2.3 From e4f83bae5df854319f917fb15ee781f4b960e77c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 21:08:07 -0800 Subject: Fixed the tensor benchmarks on apple devices --- bench/tensors/benchmark_main.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc index 65dbd89bb..1efa0dbad 100644 --- a/bench/tensors/benchmark_main.cc +++ b/bench/tensors/benchmark_main.cc @@ -49,12 +49,27 @@ static int Round(int n) { } return 10*base; } + +#ifdef __APPLE__ + #include + static mach_timebase_info_data_t g_time_info; + static void __attribute__((constructor)) init_info() { + mach_timebase_info(&g_time_info); + } +#endif + static int64_t NanoTime() { +#if defined(__APPLE__) + uint64_t t = mach_absolute_time(); + return t * g_time_info.numer / g_time_info.denom; +#else struct timespec t; t.tv_sec = t.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t); return static_cast(t.tv_sec) * 1000000000LL + t.tv_nsec; +#endif } + namespace testing { Benchmark* Benchmark::Arg(int arg) { args_.push_back(arg); -- cgit v1.2.3 From c5d25bf1d014f7ef87d55901b591d24a32ee8f4a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 23:15:45 -0800 Subject: Fixed a couple of compilation warnings. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 7a5dfbfea..a03b52629 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -345,7 +345,7 @@ template struct InnerReducer { static const bool HasOptimizedImplementation = false; - static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { eigen_assert(false && "Not implemented"); } }; @@ -355,7 +355,7 @@ template struct OuterReducer { static const bool HasOptimizedImplementation = false; - static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { eigen_assert(false && "Not implemented"); } }; -- cgit v1.2.3 From 963f2d2a8f33eebf90b3ae1944423aa875281469 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 23:37:48 -0800 Subject: Marked several methods EIGEN_DEVICE_FUNC --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e6a008ba7..1adb68894 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -378,7 +378,7 @@ struct TensorContractionEvaluatorBase } template - void evalGemv(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; 
const Index cols = m_k_size; @@ -516,7 +516,7 @@ struct TensorEvaluator - void evalProduct(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { if (this->m_j_size == 1) { this->template evalGemv(buffer); return; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 78ed5038f..3d3f6904f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -28,7 +28,7 @@ class TensorContractionBlocking { typedef typename LhsMapper::Scalar LhsScalar; typedef typename RhsMapper::Scalar RhsScalar; - TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { @@ -41,9 +41,9 @@ class TensorContractionBlocking { } } - EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } private: Index kc_; -- cgit v1.2.3 From d8d37349c3149bffd304057512ee8e6b0f42bc5a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Jan 2016 12:44:49 +0100 Subject: bug #696: enable zero-sized block at compile-time by relaxing the respective assertion --- Eigen/src/Core/Block.h | 4 ++-- test/zerosized.cpp | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index cee5591f2..cf962aed1 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -129,8 +129,8 @@ template class : Impl(xpr, startRow, startCol) { EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) - eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows() - && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols()); + eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows() + && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols()); } /** Dynamic-size constructor diff --git a/test/zerosized.cpp b/test/zerosized.cpp index da7dd0481..85c553453 100644 --- a/test/zerosized.cpp +++ b/test/zerosized.cpp @@ -25,6 +25,7 @@ template void zeroReduction(const MatrixType& m) { template void zeroSizedMatrix() { MatrixType t1; + typedef typename MatrixType::Scalar Scalar; if (MatrixType::SizeAtCompileTime == Dynamic || MatrixType::SizeAtCompileTime == 0) { @@ -45,6 +46,23 @@ template void zeroSizedMatrix() VERIFY(t1==t2); } } + + if(MatrixType::MaxColsAtCompileTime!=0 && MatrixType::MaxRowsAtCompileTime!=0) + { + Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random(1,10) : MatrixType::RowsAtCompileTime; + Index cols = MatrixType::ColsAtCompileTime==Dynamic ? 
internal::random(1,10) : MatrixType::ColsAtCompileTime; + MatrixType m(rows,cols); + zeroReduction(m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols)); + zeroReduction(m.template block(0,0,rows,0)); + zeroReduction(m.template block<0,1>(0,0)); + zeroReduction(m.template block<1,0>(0,0)); + Matrix prod = m.template block(0,0,rows,0) * m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols); + VERIFY(prod.rows()==rows && prod.cols()==cols); + VERIFY(prod.isZero()); + prod = m.template block<1,0>(0,0) * m.template block<0,1>(0,0); + VERIFY(prod.size()==1); + VERIFY(prod.isZero()); + } } template void zeroSizedVector() -- cgit v1.2.3 From d4a9e615699bd7f26864d57d2b28021b9f64b6ff Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Jan 2016 22:07:56 +0100 Subject: Extend SparseView to allow keeping explicit zeros. This is equivalent to sparseView(1,-1) but faster because the test is removed at compile-time. --- Eigen/src/Core/util/ForwardDeclarations.h | 2 +- Eigen/src/SparseCore/SparseUtil.h | 1 - Eigen/src/SparseCore/SparseView.h | 30 +++++++++++++++--------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 483af876f..e6ed965ca 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -126,7 +126,7 @@ template class TriangularBase; template class TriangularView; template class SelfAdjointView; -template class SparseView; +template class SparseView; template class WithFormat; template struct CommaInitializer; template class ReturnByValue; diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 74df0d496..3b1cf03ab 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -56,7 +56,6 @@ template class template class SparseSelfAdjointView; template class SparseDiagonalProduct; -template class SparseView; template class SparseSparseProduct; template class SparseTimeDenseProduct; diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h index b867877d8..f2af1b5d9 100644 --- a/Eigen/src/SparseCore/SparseView.h +++ b/Eigen/src/SparseCore/SparseView.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2011-2014 Gael Guennebaud +// Copyright (C) 2011-2015 Gael Guennebaud // Copyright (C) 2010 Daniel Lowengrub // // This Source Code Form is subject to the terms of the Mozilla @@ -15,8 +15,8 @@ namespace Eigen { namespace internal { -template -struct traits > : traits +template +struct traits > : traits { typedef typename MatrixType::StorageIndex StorageIndex; typedef Sparse StorageKind; @@ -27,8 +27,8 @@ struct traits > : traits } // end namespace internal -template -class SparseView : public SparseMatrixBase > +template +class SparseView : public SparseMatrixBase > { typedef typename MatrixType::Nested MatrixTypeNested; typedef typename internal::remove_all::type _MatrixTypeNested; @@ -66,13 +66,13 @@ namespace internal { // This is tricky because implementing an inner iterator on top of an IndexBased evaluator is // not easy because the evaluators do not expose the sizes of the underlying expression. 
-template -struct unary_evaluator, IteratorBased> - : public evaluator_base > +template +struct unary_evaluator, IteratorBased> + : public evaluator_base > { typedef typename evaluator::InnerIterator EvalIterator; public: - typedef SparseView XprType; + typedef SparseView XprType; class InnerIterator : public EvalIterator { @@ -88,7 +88,7 @@ struct unary_evaluator, IteratorBased> EIGEN_STRONG_INLINE InnerIterator& operator++() { EvalIterator::operator++(); - incrementToNonZero(); + if(!KeepZeros) incrementToNonZero(); return *this; } @@ -119,12 +119,12 @@ struct unary_evaluator, IteratorBased> const XprType &m_view; }; -template -struct unary_evaluator, IndexBased> - : public evaluator_base > +template +struct unary_evaluator, IndexBased> + : public evaluator_base > { public: - typedef SparseView XprType; + typedef SparseView XprType; protected: enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit }; typedef typename XprType::Scalar Scalar; @@ -144,7 +144,7 @@ struct unary_evaluator, IndexBased> EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; - incrementToNonZero(); + if(!KeepZeros) incrementToNonZero(); return *this; } -- cgit v1.2.3 From 15084cf1ac1f58085cd0635676aa1d28efb268de Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Jan 2016 22:09:45 +0100 Subject: bug #632: add support for "dense +/- sparse" operations. The current implementation is based on SparseView to make the dense subexpression compatible with the sparse one. --- Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 30 ++++++++++++++++++++++++++++++ test/sparse_basic.cpp | 5 +++++ 2 files changed, 35 insertions(+) diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index d9420ac63..06c6d0e4d 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -60,6 +60,8 @@ namespace internal { // Generic "sparse OP sparse" +template struct binary_sparse_evaluator; + template struct binary_evaluator, IteratorBased, IteratorBased> : evaluator_base > @@ -428,6 +430,34 @@ SparseMatrixBase::cwiseProduct(const MatrixBase &other) c return typename CwiseProductDenseReturnType::Type(derived(), other.derived()); } +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const SparseView > +operator+(const SparseMatrixBase &a, const MatrixBase &b) +{ + return a.derived() + SparseView(b.derived()); +} + +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseView, const SparseDerived> +operator+(const MatrixBase &a, const SparseMatrixBase &b) +{ + return SparseView(a.derived()) + b.derived(); +} + +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const SparseView > +operator-(const SparseMatrixBase &a, const MatrixBase &b) +{ + return a.derived() - SparseView(b.derived()); +} + +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseView, const SparseDerived> +operator-(const MatrixBase &a, const SparseMatrixBase &b) +{ + return SparseView(a.derived()) - b.derived(); +} + } // end namespace Eigen #endif // EIGEN_SPARSE_CWISE_BINARY_OP_H diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index d803e7dae..5a5650705 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -192,6 +192,11 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(refM4.cwiseProduct(m3), refM4.cwiseProduct(refM3)); // VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4); + VERIFY_IS_APPROX(refM4 + m3, refM4 + refM3); + VERIFY_IS_APPROX(m3 + refM4, refM3 + 
refM4); + VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3); + VERIFY_IS_APPROX(m3 - refM4, refM3 - refM4); + // test aliasing VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1)); VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval())); -- cgit v1.2.3 From 699634890afdce914553862464450966ead40ad0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Jan 2016 23:02:22 +0100 Subject: bug #946: generalize Cholmod::solve to handle any rhs expression --- Eigen/src/CholmodSupport/CholmodSupport.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index c7c521b95..b8020a92c 100644 --- a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h @@ -273,9 +273,10 @@ class CholmodBase : public SparseSolverBase const Index size = m_cholmodFactor->n; EIGEN_UNUSED_VARIABLE(size); eigen_assert(size==b.rows()); + + // Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref. + Ref > b_ref(b.derived()); - // note: cd stands for Cholmod Dense - Rhs& b_ref(b.const_cast_derived()); cholmod_dense b_cd = viewAsCholmod(b_ref); cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod); if(!x_cd) -- cgit v1.2.3 From 8ed1553d20c3837d6365e1a87f6ed11570fc7a26 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 14:39:50 +0100 Subject: bug #632: implement general coefficient-wise "dense op sparse" operations through specialized evaluators instead of using SparseView. This permits to deal with arbitrary storage order, and to by-pass the more complex iterator of the sparse-sparse case. --- Eigen/src/Core/util/XprHelper.h | 23 ++- Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 219 +++++++++++++++++++++++++---- 2 files changed, 205 insertions(+), 37 deletions(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 9fe8cfcd1..a001c473a 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -526,22 +526,21 @@ template struct promote_storage_type * the functor. * The default rules are as follows: * \code - * A op A -> A - * A op dense -> dense - * dense op B -> dense - * A * dense -> A - * dense * B -> B + * A op A -> A + * A op dense -> dense + * dense op B -> dense + * sparse op dense -> sparse + * dense op sparse -> sparse * \endcode */ template struct cwise_promote_storage_type; -template struct cwise_promote_storage_type { typedef A ret; }; -template struct cwise_promote_storage_type { typedef Dense ret; }; -template struct cwise_promote_storage_type > { typedef Dense ret; }; -template struct cwise_promote_storage_type { typedef Dense ret; }; -template struct cwise_promote_storage_type { typedef Dense ret; }; -template struct cwise_promote_storage_type > { typedef A ret; }; -template struct cwise_promote_storage_type > { typedef B ret; }; +template struct cwise_promote_storage_type { typedef A ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Sparse ret; }; +template struct cwise_promote_storage_type { typedef Sparse ret; }; /** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B. 
* The template parameter ProductTag permits to specialize the resulting storage kind wrt to diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 06c6d0e4d..c57d9ac59 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -49,15 +49,6 @@ class CwiseBinaryOpImpl namespace internal { -template::StorageKind, - typename _RhsStorageMode = typename traits::StorageKind> -class sparse_cwise_binary_op_inner_iterator_selector; - -} // end namespace internal - -namespace internal { - // Generic "sparse OP sparse" template struct binary_sparse_evaluator; @@ -155,6 +146,182 @@ protected: evaluator m_rhsImpl; }; +// dense op sparse +template +struct binary_evaluator, IndexBased, IteratorBased> + : evaluator_base > +{ +protected: + typedef typename evaluator::InnerIterator RhsIterator; + typedef CwiseBinaryOp XprType; + typedef typename traits::Scalar Scalar; + typedef typename XprType::StorageIndex StorageIndex; +public: + + class ReverseInnerIterator; + class InnerIterator + { + enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit }; + public: + + EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer) + : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.rhs().innerSize()) + { + this->operator++(); + } + + EIGEN_STRONG_INLINE InnerIterator& operator++() + { + ++m_id; + if(m_id &m_lhsEval; + RhsIterator m_rhsIter; + const BinaryOp& m_functor; + Scalar m_value; + StorageIndex m_id; + StorageIndex m_innerSize; + }; + + + enum { + CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit) + }; + + explicit binary_evaluator(const XprType& xpr) + : m_functor(xpr.functor()), + m_lhsImpl(xpr.lhs()), + m_rhsImpl(xpr.rhs()), + m_expr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + inline Index nonZerosEstimate() const { + return m_expr.size(); + } + +protected: + const BinaryOp m_functor; + evaluator m_lhsImpl; + evaluator m_rhsImpl; + const XprType &m_expr; +}; + +// sparse op dense +template +struct binary_evaluator, IteratorBased, IndexBased> + : evaluator_base > +{ +protected: + typedef typename evaluator::InnerIterator LhsIterator; + typedef CwiseBinaryOp XprType; + typedef typename traits::Scalar Scalar; + typedef typename XprType::StorageIndex StorageIndex; +public: + + class ReverseInnerIterator; + class InnerIterator + { + enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit }; + public: + + EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer) + : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.lhs().innerSize()) + { + this->operator++(); + } + + EIGEN_STRONG_INLINE InnerIterator& operator++() + { + ++m_id; + if(m_id &m_rhsEval; + const BinaryOp& m_functor; + Scalar m_value; + StorageIndex m_id; + StorageIndex m_innerSize; + }; + + + enum { + CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit) + }; + + explicit binary_evaluator(const XprType& xpr) + : m_functor(xpr.functor()), + 
m_lhsImpl(xpr.lhs()), + m_rhsImpl(xpr.rhs()), + m_expr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + inline Index nonZerosEstimate() const { + return m_expr.size(); + } + +protected: + const BinaryOp m_functor; + evaluator m_lhsImpl; + evaluator m_rhsImpl; + const XprType &m_expr; +}; + // "sparse .* sparse" template struct binary_evaluator, Lhs, Rhs>, IteratorBased, IteratorBased> @@ -289,7 +456,8 @@ public: enum { CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - Flags = XprType::Flags + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit) }; explicit binary_evaluator(const XprType& xpr) @@ -362,7 +530,8 @@ public: enum { CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - Flags = XprType::Flags + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit) }; explicit binary_evaluator(const XprType& xpr) @@ -430,32 +599,32 @@ SparseMatrixBase::cwiseProduct(const MatrixBase &other) c return typename CwiseProductDenseReturnType::Type(derived(), other.derived()); } -template -EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const SparseView > -operator+(const SparseMatrixBase &a, const MatrixBase &b) -{ - return a.derived() + SparseView(b.derived()); -} - template -EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseView, const SparseDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const DenseDerived, const SparseDerived> operator+(const MatrixBase &a, const SparseMatrixBase &b) { - return SparseView(a.derived()) + b.derived(); + return CwiseBinaryOp, const DenseDerived, const SparseDerived>(a.derived(), b.derived()); } template -EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const SparseView > -operator-(const SparseMatrixBase &a, const MatrixBase &b) +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const DenseDerived> +operator+(const SparseMatrixBase &a, const MatrixBase &b) { - return a.derived() - SparseView(b.derived()); + return CwiseBinaryOp, const SparseDerived, const DenseDerived>(a.derived(), b.derived()); } template -EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseView, const SparseDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const DenseDerived, const SparseDerived> operator-(const MatrixBase &a, const SparseMatrixBase &b) { - return SparseView(a.derived()) - b.derived(); + return CwiseBinaryOp, const DenseDerived, const SparseDerived>(a.derived(), b.derived()); +} + +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const DenseDerived> +operator-(const SparseMatrixBase &a, const MatrixBase &b) +{ + return CwiseBinaryOp, const SparseDerived, const DenseDerived>(a.derived(), b.derived()); } } // end namespace Eigen -- cgit v1.2.3 From 1bc207c528bcfc4d9fb27ada28a8aaf1b9e8d3f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 14:43:21 +0100 Subject: backout changeset d4a9e615699bd7f26864d57d2b28021b9f64b6ff : the extended SparseView is not needed anymore --- Eigen/src/Core/util/ForwardDeclarations.h | 2 +- Eigen/src/SparseCore/SparseUtil.h | 1 + Eigen/src/SparseCore/SparseView.h | 30 +++++++++++++++--------------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index e6ed965ca..483af876f 100644 
--- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -126,7 +126,7 @@ template class TriangularBase; template class TriangularView; template class SelfAdjointView; -template class SparseView; +template class SparseView; template class WithFormat; template struct CommaInitializer; template class ReturnByValue; diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 3b1cf03ab..74df0d496 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -56,6 +56,7 @@ template class template class SparseSelfAdjointView; template class SparseDiagonalProduct; +template class SparseView; template class SparseSparseProduct; template class SparseTimeDenseProduct; diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h index f2af1b5d9..b867877d8 100644 --- a/Eigen/src/SparseCore/SparseView.h +++ b/Eigen/src/SparseCore/SparseView.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2011-2015 Gael Guennebaud +// Copyright (C) 2011-2014 Gael Guennebaud // Copyright (C) 2010 Daniel Lowengrub // // This Source Code Form is subject to the terms of the Mozilla @@ -15,8 +15,8 @@ namespace Eigen { namespace internal { -template -struct traits > : traits +template +struct traits > : traits { typedef typename MatrixType::StorageIndex StorageIndex; typedef Sparse StorageKind; @@ -27,8 +27,8 @@ struct traits > : traits } // end namespace internal -template -class SparseView : public SparseMatrixBase > +template +class SparseView : public SparseMatrixBase > { typedef typename MatrixType::Nested MatrixTypeNested; typedef typename internal::remove_all::type _MatrixTypeNested; @@ -66,13 +66,13 @@ namespace internal { // This is tricky because implementing an inner iterator on top of an IndexBased evaluator is // not easy because the evaluators do not expose the sizes of the underlying expression. 
-template -struct unary_evaluator, IteratorBased> - : public evaluator_base > +template +struct unary_evaluator, IteratorBased> + : public evaluator_base > { typedef typename evaluator::InnerIterator EvalIterator; public: - typedef SparseView XprType; + typedef SparseView XprType; class InnerIterator : public EvalIterator { @@ -88,7 +88,7 @@ struct unary_evaluator, IteratorBased> EIGEN_STRONG_INLINE InnerIterator& operator++() { EvalIterator::operator++(); - if(!KeepZeros) incrementToNonZero(); + incrementToNonZero(); return *this; } @@ -119,12 +119,12 @@ struct unary_evaluator, IteratorBased> const XprType &m_view; }; -template -struct unary_evaluator, IndexBased> - : public evaluator_base > +template +struct unary_evaluator, IndexBased> + : public evaluator_base > { public: - typedef SparseView XprType; + typedef SparseView XprType; protected: enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit }; typedef typename XprType::Scalar Scalar; @@ -144,7 +144,7 @@ struct unary_evaluator, IndexBased> EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; - if(!KeepZeros) incrementToNonZero(); + incrementToNonZero(); return *this; } -- cgit v1.2.3 From 102fa96a9610ccee4f246f8c1030c0bdc380a429 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 14:58:21 +0100 Subject: Extend doc on dense+sparse --- doc/TutorialSparse.dox | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox index fb07adaa2..1f0be387d 100644 --- a/doc/TutorialSparse.dox +++ b/doc/TutorialSparse.dox @@ -257,7 +257,14 @@ Binary coefficient wise operators can also mix sparse and dense expressions: \code sm2 = sm1.cwiseProduct(dm1); dm2 = sm1 + dm1; +dm2 = dm1 - sm1; \endcode +Performance-wise, the adding/subtracting sparse and dense matrices is better performed in two steps. For instance, instead of doing dm2 = sm1 + dm1, better write: +\code +dm2 = dm1; +dm2 += sm1; +\endcode +This version has the advantage to fully exploit the higher performance of dense storage (no indirection, SIMD, etc.), and to pay the cost of slow sparse evaluation on the few non-zeros of the sparse matrix only. %Sparse expressions also support transposition: -- cgit v1.2.3 From 4281eb1e2c3c2fc563aad9625e90d7b3e6fe1e75 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:20:43 -0800 Subject: Added 2 benchmarks to the suite of tensor benchmarks running on GPU --- bench/tensors/tensor_benchmarks_gpu.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index fe807d2ab..611e8197b 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -22,6 +22,8 @@ BM_FuncGPU(memcpy); BM_FuncGPU(typeCasting); BM_FuncGPU(random); BM_FuncGPU(slicing); +BM_FuncGPU(rowChip); +BM_FuncGPU(colChip); BM_FuncGPU(shuffling); BM_FuncGPU(padding); BM_FuncGPU(striding); -- cgit v1.2.3 From ba27c8a7ded7bd766d724ab74d5730d4195d6722 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:28:43 -0800 Subject: Made the CUDA contract test more robust to numerical noise. 
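For reference, the robustness fix below boils down to a two-stage floating-point comparison: accept a small absolute error first (which handles results near zero), then fall back to Eigen's relative isApprox(). A minimal self-contained sketch of that pattern, assuming only <Eigen/Core>; the helper name approx_equal is illustrative and not part of Eigen:

#include <cmath>
#include <Eigen/Core>

// Returns true when 'actual' matches 'expected' within an absolute
// tolerance, or, failing that, within Eigen's relative tolerance.
// (Illustrative helper only; Eigen::internal::isApprox is the same
// call the patch below uses.)
static bool approx_equal(float expected, float actual, float tol = 1e-4f) {
  if (std::fabs(expected - actual) < tol) return true;      // absolute check, robust near zero
  return Eigen::internal::isApprox(expected, actual, tol);  // relative check for large magnitudes
}
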
--- unsupported/test/cxx11_tensor_contract_cuda.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index ac447dd7b..cbd902d6a 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -24,7 +24,7 @@ typedef Tensor::DimensionPair DimPair; template static void test_cuda_contraction(int m_size, int k_size, int n_size) { - std::cout<<"Calling with ("<= 1e-4) { - std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] - << " vs " << t_result_gpu.data()[i] << std::endl; - assert(false); + for (size_t i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); } cudaFree((void*)d_t_left); @@ -83,7 +87,7 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) void test_cxx11_tensor_cuda() { - std::cout<<"Calling contraction tests"<(128, 128, 128)); CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); for (int k = 32; k < 256; k++) { -- cgit v1.2.3 From d0db95f730b84e59bbad7fce24eb4becef106b9e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:43:57 -0800 Subject: Sharded the tensor thread pool test --- unsupported/test/cxx11_tensor_thread_pool.cpp | 41 +++++++++++++-------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e28cf55e2..8ee644ecc 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -17,7 +17,7 @@ using Eigen::Tensor; -static void test_multithread_elementwise() +void test_multithread_elementwise() { Tensor in1(2,3,7); Tensor in2(2,3,7); @@ -40,7 +40,7 @@ static void test_multithread_elementwise() } -static void test_multithread_compound_assignment() +void test_multithread_compound_assignment() { Tensor in1(2,3,7); Tensor in2(2,3,7); @@ -64,7 +64,7 @@ static void test_multithread_compound_assignment() } template -static void test_multithread_contraction() +void test_multithread_contraction() { Tensor t_left(30, 50, 37, 31); Tensor t_right(37, 31, 70, 2, 10); @@ -99,7 +99,7 @@ static void test_multithread_contraction() } template -static void test_contraction_corner_cases() +void test_contraction_corner_cases() { Tensor t_left(32, 500); Tensor t_right(32, 28*28); @@ -186,7 +186,7 @@ static void test_contraction_corner_cases() } template -static void test_multithread_contraction_agrees_with_singlethread() { +void test_multithread_contraction_agrees_with_singlethread() { int contract_size = internal::random(1, 5000); Tensor left(internal::random(1, 80), @@ -229,7 +229,7 @@ static void test_multithread_contraction_agrees_with_singlethread() { template -static void test_multithreaded_reductions() { +void test_multithreaded_reductions() { const int num_threads = internal::random(3, 11); ThreadPool thread_pool(num_threads); Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads); @@ -251,7 +251,7 @@ static void test_multithreaded_reductions() { } -static void test_memcpy() { +void test_memcpy() { for (int i = 0; i < 5; ++i) { const int num_threads = internal::random(3, 11); @@ -270,7 +270,7 @@ 
static void test_memcpy() { } -static void test_multithread_random() +void test_multithread_random() { Eigen::ThreadPool tp(2); Eigen::ThreadPoolDevice device(&tp, 2); @@ -281,23 +281,22 @@ static void test_multithread_random() void test_cxx11_tensor_thread_pool() { - CALL_SUBTEST(test_multithread_elementwise()); - CALL_SUBTEST(test_multithread_compound_assignment()); + CALL_SUBTEST_1(test_multithread_elementwise()); + CALL_SUBTEST_1(test_multithread_compound_assignment()); - CALL_SUBTEST(test_multithread_contraction()); - CALL_SUBTEST(test_multithread_contraction()); + CALL_SUBTEST_2(test_multithread_contraction()); + CALL_SUBTEST_2(test_multithread_contraction()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread()); // Exercise various cases that have been problematic in the past. - CALL_SUBTEST(test_contraction_corner_cases()); - CALL_SUBTEST(test_contraction_corner_cases()); + CALL_SUBTEST_4(test_contraction_corner_cases()); + CALL_SUBTEST_4(test_contraction_corner_cases()); - CALL_SUBTEST(test_multithreaded_reductions()); - CALL_SUBTEST(test_multithreaded_reductions()); + CALL_SUBTEST_5(test_multithreaded_reductions()); + CALL_SUBTEST_5(test_multithreaded_reductions()); - CALL_SUBTEST(test_memcpy()); - - CALL_SUBTEST(test_multithread_random()); + CALL_SUBTEST_6(test_memcpy()); + CALL_SUBTEST_6(test_multithread_random()); } -- cgit v1.2.3 From 2053478c5677b53c87b302d962a0545d08833a72 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:46:36 -0800 Subject: Made sure to use a tensor of rank 0 to store the result of a full reduction in the tensor thread pool test --- unsupported/test/cxx11_tensor_thread_pool.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 8ee644ecc..8e5636bb7 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -239,15 +239,15 @@ void test_multithreaded_reductions() { Tensor t1(num_rows, num_cols); t1.setRandom(); - Tensor full_redux(1); + Tensor full_redux; full_redux = t1.sum(); - Tensor full_redux_tp(1); + Tensor full_redux_tp; full_redux_tp.device(thread_pool_device) = t1.sum(); // Check that the single threaded and the multi threaded reductions return // the same result. - VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0)); + VERIFY_IS_APPROX(full_redux(), full_redux_tp()); } -- cgit v1.2.3 From 32088c06a169ed8d1286c491ed21a20321ae58a5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:51:14 -0800 Subject: Made the comparison between single and multithreaded contraction results more resistant to numerical noise to prevent spurious test failures. 
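As context for the full-reduction patch two commits up: reducing a tensor over all of its dimensions yields a rank-0 tensor, whose single value is read back with operator()(). A minimal usage sketch, assuming only <unsupported/Eigen/CXX11/Tensor>:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(4, 3);
  t.setConstant(1.0f);
  Eigen::Tensor<float, 0> total = t.sum(); // reducing over all dims gives a rank-0 tensor
  float value = total();                   // rank-0 tensors are read with operator()
  return (value == 12.0f) ? 0 : 1;
}
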
--- unsupported/test/cxx11_tensor_thread_pool.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 8e5636bb7..aa76009b7 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -91,10 +91,15 @@ void test_multithread_contraction() for (ptrdiff_t i = 0; i < t_result.size(); i++) { VERIFY(&t_result.data()[i] != &m_result.data()[i]); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; - assert(false); + if (fabs(t_result(i) - m_result(i)) < 1e-4) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) { + continue; } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << m_result(i) << std::endl; + assert(false); } } -- cgit v1.2.3 From 9de155d15320a68182e7f572adf504cad6172419 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:56:47 -0800 Subject: Added a test to cover threaded tensor shuffling --- unsupported/test/cxx11_tensor_thread_pool.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index aa76009b7..e46197464 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -283,6 +283,31 @@ void test_multithread_random() t.device(device) = t.random>(); } +template +void test_multithread_shuffle() +{ + Tensor tensor(17,5,7,11); + tensor.setRandom(); + + const int num_threads = internal::random(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor shuffle(7,5,11,17); + array shuffles = {{2,1,3,0}}; + shuffle.device(device) = tensor.shuffle(shuffles); + + for (int i = 0; i < 17; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i)); + } + } + } + } +} + void test_cxx11_tensor_thread_pool() { @@ -304,4 +329,6 @@ void test_cxx11_tensor_thread_pool() CALL_SUBTEST_6(test_memcpy()); CALL_SUBTEST_6(test_multithread_random()); + CALL_SUBTEST_6(test_multithread_shuffle()); + CALL_SUBTEST_6(test_multithread_shuffle()); } -- cgit v1.2.3 From bd21aba1817f76f4e72ddf3c55ef23d4a62ed6f7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 11:47:09 -0800 Subject: Sharded the cxx11_tensor_cuda test and fixed a memory leak --- unsupported/test/cxx11_tensor_cuda.cu | 131 +++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 43 deletions(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 79f1c5315..60f9314a5 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -63,6 +63,10 @@ void test_cuda_elementwise_small() { out(Eigen::array(i)), in1(Eigen::array(i)) + in2(Eigen::array(i))); } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); } void test_cuda_elementwise() @@ -113,6 +117,11 @@ void test_cuda_elementwise() } } } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); } void test_cuda_reduction() @@ -158,10 +167,13 @@ void test_cuda_reduction() VERIFY_IS_APPROX(out(i,j), expected); } } + + cudaFree(d_in1); + cudaFree(d_out); } template -static void 
test_cuda_contraction() +void test_cuda_contraction() { // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on @@ -216,10 +228,14 @@ static void test_cuda_contraction() assert(false); } } + + cudaFree(d_t_left); + cudaFree(d_t_right); + cudaFree(d_t_result); } template -static void test_cuda_convolution_1d() +void test_cuda_convolution_1d() { Tensor input(74,37,11,137); Tensor kernel(4); @@ -266,9 +282,13 @@ static void test_cuda_convolution_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void test_cuda_convolution_inner_dim_col_major_1d() +void test_cuda_convolution_inner_dim_col_major_1d() { Tensor input(74,9,11,7); Tensor kernel(4); @@ -315,9 +335,13 @@ static void test_cuda_convolution_inner_dim_col_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void test_cuda_convolution_inner_dim_row_major_1d() +void test_cuda_convolution_inner_dim_row_major_1d() { Tensor input(7,9,11,74); Tensor kernel(4); @@ -364,10 +388,14 @@ static void test_cuda_convolution_inner_dim_row_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template -static void test_cuda_convolution_2d() +void test_cuda_convolution_2d() { Tensor input(74,37,11,137); Tensor kernel(3,4); @@ -424,10 +452,14 @@ static void test_cuda_convolution_2d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template -static void test_cuda_convolution_3d() +void test_cuda_convolution_3d() { Tensor input(Eigen::array(74,37,11,137,17)); Tensor kernel(3,4,2); @@ -498,6 +530,10 @@ static void test_cuda_convolution_3d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } @@ -535,6 +571,9 @@ void test_cuda_lgamma(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } template @@ -571,6 +610,9 @@ void test_cuda_erf(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } template @@ -607,47 +649,50 @@ void test_cuda_erfc(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_elementwise_small()); - CALL_SUBTEST(test_cuda_elementwise()); - CALL_SUBTEST(test_cuda_reduction()); - CALL_SUBTEST(test_cuda_contraction()); - CALL_SUBTEST(test_cuda_contraction()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_lgamma(1.0f)); - CALL_SUBTEST(test_cuda_lgamma(100.0f)); - CALL_SUBTEST(test_cuda_lgamma(0.01f)); - CALL_SUBTEST(test_cuda_lgamma(0.001f)); - CALL_SUBTEST(test_cuda_erf(1.0f)); - CALL_SUBTEST(test_cuda_erf(100.0f)); - CALL_SUBTEST(test_cuda_erf(0.01f)); - CALL_SUBTEST(test_cuda_erf(0.001f)); - CALL_SUBTEST(test_cuda_erfc(1.0f)); + CALL_SUBTEST_1(test_cuda_elementwise_small()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_1(test_cuda_reduction()); + CALL_SUBTEST_2(test_cuda_contraction()); + CALL_SUBTEST_2(test_cuda_contraction()); + CALL_SUBTEST_3(test_cuda_convolution_1d()); + 
CALL_SUBTEST_3(test_cuda_convolution_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_2d()); + CALL_SUBTEST_3(test_cuda_convolution_2d()); + CALL_SUBTEST_3(test_cuda_convolution_3d()); + CALL_SUBTEST_3(test_cuda_convolution_3d()); + CALL_SUBTEST_4(test_cuda_lgamma(1.0f)); + CALL_SUBTEST_4(test_cuda_lgamma(100.0f)); + CALL_SUBTEST_4(test_cuda_lgamma(0.01f)); + CALL_SUBTEST_4(test_cuda_lgamma(0.001f)); + CALL_SUBTEST_4(test_cuda_erf(1.0f)); + CALL_SUBTEST_4(test_cuda_erf(100.0f)); + CALL_SUBTEST_4(test_cuda_erf(0.01f)); + CALL_SUBTEST_4(test_cuda_erf(0.001f)); + CALL_SUBTEST_4(test_cuda_erfc(1.0f)); // CALL_SUBTEST(test_cuda_erfc(100.0f)); - CALL_SUBTEST(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST(test_cuda_erfc(0.01f)); - CALL_SUBTEST(test_cuda_erfc(0.001f)); - CALL_SUBTEST(test_cuda_lgamma(1.0)); - CALL_SUBTEST(test_cuda_lgamma(100.0)); - CALL_SUBTEST(test_cuda_lgamma(0.01)); - CALL_SUBTEST(test_cuda_lgamma(0.001)); - CALL_SUBTEST(test_cuda_erf(1.0)); - CALL_SUBTEST(test_cuda_erf(100.0)); - CALL_SUBTEST(test_cuda_erf(0.01)); - CALL_SUBTEST(test_cuda_erf(0.001)); - CALL_SUBTEST(test_cuda_erfc(1.0)); + CALL_SUBTEST_4(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc(0.01f)); + CALL_SUBTEST_4(test_cuda_erfc(0.001f)); + CALL_SUBTEST_4(test_cuda_lgamma(1.0)); + CALL_SUBTEST_4(test_cuda_lgamma(100.0)); + CALL_SUBTEST_4(test_cuda_lgamma(0.01)); + CALL_SUBTEST_4(test_cuda_lgamma(0.001)); + CALL_SUBTEST_4(test_cuda_erf(1.0)); + CALL_SUBTEST_4(test_cuda_erf(100.0)); + CALL_SUBTEST_4(test_cuda_erf(0.01)); + CALL_SUBTEST_4(test_cuda_erf(0.001)); + CALL_SUBTEST_4(test_cuda_erfc(1.0)); // CALL_SUBTEST(test_cuda_erfc(100.0)); - CALL_SUBTEST(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST(test_cuda_erfc(0.01)); - CALL_SUBTEST(test_cuda_erfc(0.001)); + CALL_SUBTEST_4(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc(0.01)); + CALL_SUBTEST_4(test_cuda_erfc(0.001)); } -- cgit v1.2.3 From 483082ef6e4ff25d43cba03e1b1f2ed15000ac3b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 11:59:22 -0800 Subject: Fixed a few memory leaks in the cuda tests --- unsupported/test/cxx11_tensor_argmax_cuda.cu | 10 ++++++++++ unsupported/test/cxx11_tensor_reduction_cuda.cu | 3 +++ 2 files changed, 13 insertions(+) diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index d37490d15..48cec510d 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -56,6 +56,10 @@ void test_cuda_simple_argmax() VERIFY_IS_EQUAL(out_max(Eigen::array(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array(0)), 0); + + cudaFree(d_in); + cudaFree(d_out_max); + cudaFree(d_out_min); } template @@ -141,6 +145,9 @@ void test_cuda_argmax_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } @@ -227,6 +234,9 @@ void test_cuda_argmin_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu 
b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 9e06eb126..417242586 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -48,6 +48,9 @@ static void test_full_reductions() { // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); } void test_cxx11_tensor_reduction_cuda() { -- cgit v1.2.3 From 3ba8a3ab1a77fd2f58448187c6881d13bf51f430 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 22:14:04 +0100 Subject: Disable underflow unit test on the i387 FPU. --- test/adjoint.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/adjoint.cpp b/test/adjoint.cpp index b1e69c2e5..9c895e0ac 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -45,12 +45,14 @@ template<> struct adjoint_specific<false> { // check null inputs VERIFY_IS_APPROX((v1*0).normalized(), (v1*0)); +#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE) RealScalar very_small = (std::numeric_limits<RealScalar>::min)(); VERIFY( (v1*very_small).norm() == 0 ); VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small)); v3 = v1*very_small; v3.normalize(); VERIFY_IS_APPROX(v3, (v1*very_small)); +#endif // check compatibility of dot and adjoint ref = NumTraits<Scalar>::IsInteger ? 0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm())); -- cgit v1.2.3 From a4e4542b89092eb2ed2984aae6f15bbcc43d7ed6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 22:26:17 +0100 Subject: Avoid overflow in unit test. --- test/stable_norm.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp index 9f12320e0..c3eb5ff31 100644 --- a/test/stable_norm.cpp +++ b/test/stable_norm.cpp @@ -174,7 +174,8 @@ template<typename MatrixType> void stable_norm(const MatrixType& m) VERIFY_IS_APPROX(vcopy.norm(), RealScalar(1)); VERIFY_IS_APPROX((vbig.stableNormalized()).norm(), RealScalar(1)); VERIFY_IS_APPROX((vsmall.stableNormalized()).norm(), RealScalar(1)); - VERIFY_IS_APPROX(vbig, vbig.stableNorm() * vbig.stableNormalized()); + RealScalar big_scaling = ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4)); + VERIFY_IS_APPROX(vbig/big_scaling, (vbig.stableNorm() * vbig.stableNormalized()).eval()/big_scaling); VERIFY_IS_APPROX(vsmall, vsmall.stableNorm() * vsmall.stableNormalized()); } } -- cgit v1.2.3 From d1421659427a5bb237bbba1c86781267b98ce235 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 31 Jan 2016 16:34:10 +0100 Subject: bug #667: declare several critical functions as FORCE_INLINE to make ICC happier. 
HG: branch 'default' HG: changed Eigen/src/Core/ArrayBase.h HG: changed Eigen/src/Core/AssignEvaluator.h HG: changed Eigen/src/Core/CoreEvaluators.h HG: changed Eigen/src/Core/CwiseUnaryOp.h HG: changed Eigen/src/Core/DenseBase.h HG: changed Eigen/src/Core/MatrixBase.h --- Eigen/src/Core/ArrayBase.h | 16 ++-- Eigen/src/Core/AssignEvaluator.h | 34 ++++--- Eigen/src/Core/CoreEvaluators.h | 197 +++++++++++++++++++++++++++------------ Eigen/src/Core/CwiseUnaryOp.h | 16 ++-- Eigen/src/Core/DenseBase.h | 8 +- Eigen/src/Core/MatrixBase.h | 8 +- 6 files changed, 183 insertions(+), 96 deletions(-) diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index b4c24a27a..0443e3032 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -103,7 +103,7 @@ template class ArrayBase /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const ArrayBase& other) { internal::call_assignment(derived(), other.derived()); @@ -112,28 +112,28 @@ template class ArrayBase /** Set all the entries to \a value. * \sa DenseBase::setConstant(), DenseBase::fill() */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Scalar &value) { Base::setConstant(value); return derived(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const Scalar& scalar); - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const Scalar& scalar); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const ArrayBase& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const ArrayBase& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const ArrayBase& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const ArrayBase& other); public: diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index f6632de69..5b65bfb0c 100755 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -637,7 +637,7 @@ protected: ***************************************************************************/ template -EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -654,7 +654,7 @@ EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const S } template -EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src) { call_dense_assignment_loop(dst, src, internal::assign_op()); } @@ -688,26 +688,30 @@ struct Assignment; // does not has to bother about these annoying details. 
template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op()); } template -EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(const Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op()); } // Deal with "assume-aliasing" template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) { typename plain_matrix_type::type tmp(src); call_assignment_no_alias(dst, tmp, func); } template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) { call_assignment_no_alias(dst, src, func); } @@ -715,14 +719,16 @@ EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& fun // by-pass "assume-aliasing" // When there is no aliasing, we require that 'dst' has been properly resized template class StorageBase, typename Src, typename Func> -EIGEN_DEVICE_FUNC void call_assignment(NoAlias& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(NoAlias& dst, const Src& src, const Func& func) { call_assignment_no_alias(dst.expression(), src, func); } template -EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) { enum { NeedToTranspose = ( (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) @@ -747,13 +753,15 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Assignment::run(actualDst, src, func); } template -EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias(Dst& dst, const Src& src) { call_assignment_no_alias(dst, src, internal::assign_op()); } template -EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) { Index dstRows = src.rows(); Index dstCols = src.cols(); @@ -767,7 +775,8 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src Assignment::run(dst, src, func); } template -EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) { call_assignment_no_alias_no_transpose(dst, src, internal::assign_op()); } @@ -779,7 +788,8 @@ template void check_for_aliasing(const Dst &dst, con template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment { - EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void run(DstXprType &dst, const SrcXprType 
&src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 7776948d1..a729e0454 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -148,7 +148,8 @@ struct evaluator > EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { if (IsRowMajor) return m_data[row * m_outerStride.value() + col]; @@ -156,12 +157,14 @@ struct evaluator > return m_data[row + col * m_outerStride.value()]; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_data[index]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { if (IsRowMajor) return const_cast(m_data)[row * m_outerStride.value() + col]; @@ -169,12 +172,14 @@ struct evaluator > return const_cast(m_data)[row + col * m_outerStride.value()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return const_cast(m_data)[index]; } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { if (IsRowMajor) @@ -184,12 +189,14 @@ struct evaluator > } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return ploadt(m_data + index); } template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { if (IsRowMajor) @@ -201,6 +208,7 @@ struct evaluator > } template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { return pstoret(const_cast(m_data) + index, x); @@ -260,45 +268,53 @@ struct unary_evaluator, IndexBased> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(col, row); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(col, row); } - EIGEN_DEVICE_FUNC typename XprType::Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename XprType::Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet(col, row); } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_argImpl.template packet(index); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { m_argImpl.template writePacket(col, row, x); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { m_argImpl.template writePacket(index, x); @@ -338,23 +354,27 @@ struct evaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType 
coeff(Index row, Index col) const { return m_functor(row, col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(index); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.template packetOp(row, col); } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.template packetOp(index); @@ -380,7 +400,8 @@ struct unary_evaluator, IndexBased > Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) { @@ -390,23 +411,27 @@ struct unary_evaluator, IndexBased > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_functor(m_argImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.packetOp(m_argImpl.template packet(row, col)); } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); @@ -466,17 +491,20 @@ struct binary_evaluator, IndexBased, IndexBase typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index)); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.packetOp(m_lhsImpl.template packet(row, col), @@ -484,6 +512,7 @@ struct binary_evaluator, IndexBased, IndexBase } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.packetOp(m_lhsImpl.template packet(index), @@ -523,22 +552,26 @@ struct unary_evaluator, IndexBased> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_unaryOp(m_argImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_unaryOp(m_argImpl.coeff(index)); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_unaryOp(m_argImpl.coeffRef(row, col)); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_unaryOp(m_argImpl.coeffRef(index)); } @@ -578,47 +611,55 @@ struct mapbase_evaluator : evaluator_base EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - 
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()]; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_data[index * m_xpr.innerStride()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_data[index * m_xpr.innerStride()]; } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride(); return internal::ploadt(ptr); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return internal::ploadt(m_data + index * m_xpr.innerStride()); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride(); return internal::pstoret(ptr, x); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { internal::pstoret(m_data + index * m_xpr.innerStride(), x); @@ -767,46 +808,54 @@ struct unary_evaluator, IndexBa RowsAtCompileTime = XprType::RowsAtCompileTime }; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return packet(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { return writePacket(RowsAtCompileTime == 1 ? 
0 : index, @@ -859,7 +908,7 @@ struct evaluator > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) }; - inline EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) @@ -869,7 +918,8 @@ struct evaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; - inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { if (m_conditionImpl.coeff(row, col)) return m_thenImpl.coeff(row, col); @@ -877,7 +927,8 @@ struct evaluator > return m_elseImpl.coeff(row, col); } - inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { if (m_conditionImpl.coeff(index)) return m_thenImpl.coeff(index); @@ -921,7 +972,8 @@ struct unary_evaluator > m_cols(replicate.nestedExpression().cols()) {} - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { // try to avoid using modulo; this is a pure optimization strategy const Index actual_row = internal::traits::RowsAtCompileTime==1 ? 0 @@ -934,7 +986,8 @@ struct unary_evaluator > return m_argImpl.coeff(actual_row, actual_col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { // try to avoid using modulo; this is a pure optimization strategy const Index actual_index = internal::traits::RowsAtCompileTime==1 @@ -945,6 +998,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { const Index actual_row = internal::traits::RowsAtCompileTime==1 ? 
0 @@ -958,6 +1012,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { const Index actual_index = internal::traits::RowsAtCompileTime==1 @@ -1008,7 +1063,8 @@ struct evaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const { if (Direction==Vertical) return m_functor(m_arg.col(j)); @@ -1016,7 +1072,8 @@ struct evaluator > return m_functor(m_arg.row(i)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const { if (Direction==Vertical) return m_functor(m_arg.col(index)); @@ -1051,45 +1108,53 @@ struct evaluator_wrapper_base typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(row, col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(row, col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet(row, col); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_argImpl.template packet(index); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { m_argImpl.template writePacket(row, col, x); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { m_argImpl.template writePacket(index, x); @@ -1164,29 +1229,34 @@ struct unary_evaluator > m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) { } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? m_cols.value() - col - 1 : col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? 
m_cols.value() - col - 1 : col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { enum { @@ -1201,6 +1271,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { enum { PacketSize = unpacket_traits::size }; @@ -1208,6 +1279,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { // FIXME we could factorize some code with packet(i,j) @@ -1224,6 +1296,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { enum { PacketSize = unpacket_traits::size }; @@ -1267,22 +1340,26 @@ struct evaluator > typedef typename internal::conditional::value, typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index) const { return m_argImpl.coeff(row + rowOffset(), row + colOffset()); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index + rowOffset(), index + colOffset()); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index) { return m_argImpl.coeffRef(row + rowOffset(), row + colOffset()); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index + rowOffset(), index + colOffset()); } diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index 8c182303c..ff1391996 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -61,26 +61,26 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::type XprTypeNested; typedef typename internal::remove_all::type NestedExpression; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) : m_xpr(xpr), m_functor(func) {} - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index rows() const { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index cols() const { return m_xpr.cols(); } /** \returns the functor representing the unary operation */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::remove_all::type& nestedExpression() { return m_xpr; } diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 2c5c0ad28..ea8283cea 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -275,13 +275,13 @@ template class DenseBase /** Copies \a other into *this. \returns a reference to *this. 
*/ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other); /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other); template @@ -388,9 +388,9 @@ template class DenseBase inline bool hasNaN() const; inline bool allFinite() const; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE inline Derived& operator*=(const Scalar& other); - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE inline Derived& operator/=(const Scalar& other); typedef typename internal::add_const_on_value_type::type>::type EvalReturnType; diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 338879c73..3770ab257 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -135,14 +135,14 @@ template class MatrixBase /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const MatrixBase& other); // We cannot inherit here via Base::operator= since it is causing // trouble with MSVC. template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other); template @@ -154,10 +154,10 @@ template class MatrixBase Derived& operator=(const ReturnByValue& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const MatrixBase& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); #ifdef __CUDACC__ -- cgit v1.2.3 From 4a2ddfb81dbbb07c2f399d7bc9664f66abc1e65a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 31 Jan 2016 10:44:15 -0800 Subject: Sharded the CUDA argmax tensor test --- unsupported/test/cxx11_tensor_argmax_cuda.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 48cec510d..45311d4f7 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -242,10 +242,10 @@ void test_cuda_argmin_dim() void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_simple_argmax()); - CALL_SUBTEST(test_cuda_simple_argmax()); - CALL_SUBTEST(test_cuda_argmax_dim()); - CALL_SUBTEST(test_cuda_argmax_dim()); - CALL_SUBTEST(test_cuda_argmin_dim()); - CALL_SUBTEST(test_cuda_argmin_dim()); + CALL_SUBTEST_1(test_cuda_simple_argmax()); + CALL_SUBTEST_1(test_cuda_simple_argmax()); + CALL_SUBTEST_2(test_cuda_argmax_dim()); + CALL_SUBTEST_2(test_cuda_argmax_dim()); + CALL_SUBTEST_3(test_cuda_argmin_dim()); + CALL_SUBTEST_3(test_cuda_argmin_dim()); } -- cgit v1.2.3 From 3f1ee458333ab59218342d595b60536aee760f6a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 31 Jan 2016 10:48:49 -0800 Subject: Fixed compilation errors triggered by duplicate inline declaration --- Eigen/src/Core/CwiseUnaryOp.h | 2 +- Eigen/src/Core/DenseBase.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index ff1391996..1d2dd19f2 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -62,7 +62,7 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::type NestedExpression; 
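The compilation errors this commit fixes were introduced by the previous patch: EIGEN_STRONG_INLINE expands to plain inline outside MSVC and ICC, so placing it next to an explicit inline keyword yields a duplicated specifier. A minimal reproduction with a stand-in macro (hypothetical, but it mirrors Eigen's non-MSVC expansion):

  #define STRONG_INLINE inline  // stands in for EIGEN_STRONG_INLINE on GCC/Clang
  struct S {
    STRONG_INLINE inline int f() const { return 1; }  // expands to 'inline inline': GCC rejects it, Clang warns
  };

The hunks below resolve the problem by dropping the now-redundant inline keyword.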
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) : m_xpr(xpr), m_functor(func) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index ea8283cea..5a38e5f22 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -389,9 +389,9 @@ template class DenseBase inline bool allFinite() const; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - inline Derived& operator*=(const Scalar& other); + Derived& operator*=(const Scalar& other); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - inline Derived& operator/=(const Scalar& other); + Derived& operator/=(const Scalar& other); typedef typename internal::add_const_on_value_type::type>::type EvalReturnType; /** \returns the matrix or vector obtained by evaluating this expression. -- cgit v1.2.3 From 6720b38fbf60d750393af7d63777b06438ba5d81 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 31 Jan 2016 16:48:50 -0800 Subject: Fixed a few compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 5 ++++- unsupported/test/cxx11_tensor_empty.cpp | 12 ++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 18a916e46..ed933b6ac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -41,7 +41,10 @@ class TensorStorage private: static const std::size_t Size = FixedDimensions::total_size; - EIGEN_ALIGN_MAX T m_data[Size]; + // Allocate an array of size at least one to prevent compiler warnings. + static const std::size_t MinSize = max_n_1::size; + EIGEN_ALIGN_MAX T m_data[MinSize]; + FixedDimensions m_dimensions; public: diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp index ca03a297c..9130fff35 100644 --- a/unsupported/test/cxx11_tensor_empty.cpp +++ b/unsupported/test/cxx11_tensor_empty.cpp @@ -16,16 +16,20 @@ static void test_empty_tensor() { Tensor source; Tensor tgt1 = source; - Tensor tgt2; - tgt2 = source; + Tensor tgt2(source); + Tensor tgt3; + tgt3 = tgt1; + tgt3 = tgt2; } static void test_empty_fixed_size_tensor() { TensorFixedSize> source; TensorFixedSize> tgt1 = source; - TensorFixedSize> tgt2; - tgt2 = source; + TensorFixedSize> tgt2(source); + TensorFixedSize> tgt3; + tgt3 = tgt1; + tgt3 = tgt2; } -- cgit v1.2.3 From e80ed948e14c2de929a97bfbacab0b3a9172a59e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 31 Jan 2016 20:09:41 -0800 Subject: Fixed a number of compilation warnings generated by the cuda tests --- .../Eigen/CXX11/src/Core/util/EmulateArray.h | 39 ++++++++++++++++++++-- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 8 ++--- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 4 +-- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 456b34d0b..89aeb03e7 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -25,6 +25,16 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { return values[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
T& front() const { return values[0]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { return values[n-1]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static std::size_t size() { return n; } @@ -123,13 +133,33 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[] (size_t) { eigen_assert(false && "Can't index a zero size array"); - return *static_cast(NULL); + return dummy; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t) const { eigen_assert(false && "Can't index a zero size array"); - return *static_cast(NULL); + return dummy; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; } static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } @@ -142,6 +172,9 @@ template class array { eigen_assert(l.size() == 0); } #endif + + private: + T dummy; }; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 367a152a0..67c797802 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -21,7 +21,7 @@ namespace Eigen { */ namespace internal { -template +template class IndexMapper { public: IndexMapper(const InputDims& input_dims, const array& kernel_dims, @@ -123,7 +123,7 @@ class IndexMapper { } inputIndex += p * m_inputStrides[NumKernelDims]; } else { - int limit = 0; + std::ptrdiff_t limit = 0; if (NumKernelDims < NumDims) { limit = NumDims - NumKernelDims - 1; } @@ -147,7 +147,7 @@ class IndexMapper { } outputIndex += p * m_outputStrides[NumKernelDims]; } else { - int limit = 0; + std::ptrdiff_t limit = 0; if (NumKernelDims < NumDims) { limit = NumDims - NumKernelDims - 1; } @@ -206,7 +206,7 @@ class IndexMapper { } private: - static const size_t NumDims = internal::array_size::value; + static const int NumDims = internal::array_size::value; array m_inputStrides; array m_outputStrides; array m_cudaInputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index a03b52629..22aea5ea4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -463,7 +463,7 @@ struct TensorEvaluator, Device> m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; } } else { - m_outputStrides[NumOutputDims - 1] = 1; + m_outputStrides.back() = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; } @@ -479,7 +479,7 @@ struct TensorEvaluator, Device> input_strides[i] = input_strides[i-1] * input_dims[i-1]; } } else { - input_strides[NumInputDims - 1] = 1; + input_strides.back() = 1; for (int i = NumInputDims - 2; i >= 0; --i) { input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; } -- cgit v1.2.3 From 2c3224924b8a290cbc33847d20103ec0db479828 Mon Sep 17 00:00:00 2001 From: Gael 
Guennebaud Date: Mon, 1 Feb 2016 10:23:45 +0100 Subject: Fix warning and replace min/max macros by calls to mini/maxi --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 960df4a46..933cd564b 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -98,9 +98,6 @@ namespace internal { /* === Definitions ========================================================== */ /* ========================================================================== */ -#define COLAMD_MAX(a,b) (((a) > (b)) ? (a) : (b)) -#define COLAMD_MIN(a,b) (((a) < (b)) ? (a) : (b)) - #define ONES_COMPLEMENT(r) (-(r)-1) /* -------------------------------------------------------------------------- */ @@ -735,8 +732,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ; - dense_col_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -800,7 +797,7 @@ static void init_scoring else { /* keep track of max degree of remaining rows */ - max_deg = COLAMD_MAX (max_deg, deg) ; + max_deg = numext::maxi(max_deg, deg) ; } } COLAMD_DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ; @@ -838,7 +835,7 @@ static void init_scoring /* add row's external degree */ score += Row [row].shared1.degree - 1 ; /* guard against integer overflow */ - score = COLAMD_MIN (score, n_col) ; + score = numext::mini(score, n_col) ; } /* determine pruned column length */ col_length = (IndexType) (new_cp - &A [Col [c].start]) ; @@ -910,7 +907,7 @@ static void init_scoring head [score] = c ; /* see if this score is less than current min */ - min_score = COLAMD_MIN (min_score, score) ; + min_score = numext::mini(min_score, score) ; } @@ -1036,7 +1033,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Garbage_collection, if necessary ============================= */ - needed_memory = COLAMD_MIN (pivot_col_score, n_col - k) ; + needed_memory = numext::mini(pivot_col_score, n_col - k) ; if (pfree + needed_memory >= Alen) { pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; @@ -1095,7 +1092,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* clear tag on pivot column */ Col [pivot_col].shared1.thickness = pivot_col_thickness ; - max_deg = COLAMD_MAX (max_deg, pivot_row_degree) ; + max_deg = numext::maxi(max_deg, pivot_row_degree) ; /* === Kill all rows used to construct pivot row ==================== */ @@ -1269,7 +1266,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* add set difference */ cur_score += row_mark - tag_mark ; /* integer overflow... 
*/ - cur_score = COLAMD_MIN (cur_score, n_col) ; + cur_score = numext::mini(cur_score, n_col) ; } /* recompute the column's length */ @@ -1382,7 +1379,7 @@ static IndexType find_ordering /* return the number of garbage collections */ cur_score -= Col [col].shared1.thickness ; /* make sure score is less or equal than the max score */ - cur_score = COLAMD_MIN (cur_score, max_score) ; + cur_score = numext::mini(cur_score, max_score) ; COLAMD_ASSERT (cur_score >= 0) ; /* store updated score */ @@ -1405,7 +1402,7 @@ static IndexType find_ordering /* return the number of garbage collections */ head [cur_score] = col ; /* see if this score is less than current min */ - min_score = COLAMD_MIN (min_score, cur_score) ; + min_score = numext::mini(min_score, cur_score) ; } -- cgit v1.2.3 From e1d219e5c9ea782550882aa8eb131b107f05105e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Feb 2016 14:25:34 +0100 Subject: bug #698: fix linspaced for integer types. --- Eigen/src/Core/functors/NullaryFunctors.h | 63 ++++++++++++++++++++++++------- test/nullary.cpp | 31 +++++++++------ 2 files changed, 70 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index cd9fbf267..71629af4c 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -37,7 +37,7 @@ template struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; -template struct linspaced_op_impl; +template struct linspaced_op_impl; // linear access for packet ops: // 1) initialization @@ -48,12 +48,12 @@ template struct linspaced_ // TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp) // in order to avoid the padd() in operator() ? template -struct linspaced_op_impl +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& step) : - m_low(low), m_step(step), - m_packetStep(pset1(unpacket_traits::size*step)), - m_base(padd(pset1(low), pmul(pset1(step),plset(-unpacket_traits::size)))) {} + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), + m_packetStep(pset1(unpacket_traits::size*m_step)), + m_base(padd(pset1(low), pmul(pset1(m_step),plset(-unpacket_traits::size)))) {} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const @@ -75,11 +75,11 @@ struct linspaced_op_impl // 1) each step // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) template -struct linspaced_op_impl +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& step) : - m_low(low), m_step(step), - m_lowPacket(pset1(m_low)), m_stepPacket(pset1(m_step)), m_interPacket(plset(0)) {} + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_step(num_steps==1 ? 
Scalar() : (high-low)/Scalar(num_steps-1)), + m_lowPacket(pset1(m_low)), m_stepPacket(pset1(m_step)), m_interPacket(plset(0)) {} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } @@ -95,6 +95,31 @@ struct linspaced_op_impl const Packet m_interPacket; }; +template +struct linspaced_op_impl +{ + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_length(high-low), m_numSteps(num_steps), m_interPacket(plset(0)) + {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar operator() (Index i) const { + return m_low + (m_length*Scalar(i))/(m_numSteps-1); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Packet packetOp(Index i) const { + return internal::padd(pset1(m_low), pdiv(pmul(pset1(m_length), padd(pset1(Scalar(i)),m_interPacket)), + pset1(m_numSteps-1))); } + + const Scalar m_low; + const Scalar m_length; + const Index m_numSteps; + const Packet m_interPacket; +}; + // ----- Linspace functor ---------------------------------------------------------------- // Forward declaration (we default to random access which does not really give @@ -102,10 +127,20 @@ struct linspaced_op_impl // nested expressions). template struct linspaced_op; template struct functor_traits< linspaced_op > -{ enum { Cost = 1, PacketAccess = packet_traits::HasSetLinear, IsRepeatable = true }; }; +{ + enum + { + Cost = 1, + PacketAccess = packet_traits::HasSetLinear + && ((!NumTraits::IsInteger) || packet_traits::HasDiv), + IsRepeatable = true + }; +}; template struct linspaced_op { - linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {} + linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) + : impl((num_steps==1 ? high : low),high,num_steps) + {} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); } @@ -134,7 +169,9 @@ template struct linspa // This proxy object handles the actual required temporaries, the different // implementations (random vs. sequential access) as well as the // correct piping to size 2/4 packet operations. - const linspaced_op_impl impl; + // As long as we don't have a Bresenham-like implementation for linear-access and integer types, + // we have to by-pass RandomAccess for integer types. See bug 698. + const linspaced_op_impl::IsInteger?true:RandomAccess),NumTraits::IsInteger> impl; }; // all functors allow linear access, except scalar_identity_op. So we fix here a quick meta diff --git a/test/nullary.cpp b/test/nullary.cpp index 4844f2952..8d65910eb 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -48,30 +48,32 @@ void testVectorType(const VectorType& base) VectorType m(base); m.setLinSpaced(size,low,high); + if(!NumTraits::IsInteger) + { + VectorType n(size); + for (int i=0; i::epsilon() ); - - // These guys sometimes fail! This is not good. Any ideas how to fix them!? - //VERIFY( m(m.size()-1) == high ); - //VERIFY( m(0) == low ); + VERIFY( internal::isApprox(m(m.size()-1),high) ); + VERIFY( size==1 || internal::isApprox(m(0),low) ); // sequential access version m = VectorType::LinSpaced(Sequential,size,low,high); VERIFY_IS_APPROX(m,n); - // These guys sometimes fail! This is not good. Any ideas how to fix them!? 
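The question in that removed comment is answered by this very patch: the per-step form low + i*step rounds the step once and then multiplies the rounding error by i, so the last sample can miss high, whereas the rescaled form low + (length*i)/divisor introduced above lands exactly on both endpoints. The effect is starkest for integers, where the old step computation truncates; a standalone sketch in plain C++ (not Eigen code):

  #include <cstdio>
  int main() {
    const int low = 0, high = 10, n = 4;
    const int step = (high - low) / (n - 1);                    // truncates to 3
    for (int i = 0; i < n; ++i) {
      const int per_step = low + i * step;                      // yields 0 3 6 9: misses high
      const int rescaled = low + ((high - low) * i) / (n - 1);  // yields 0 3 6 10: hits high
      std::printf("i=%d per_step=%d rescaled=%d\n", i, per_step, rescaled);
    }
    return 0;
  }

For floating point the same drift amounts to a few ulps, which is why the endpoint checks are reinstated below with internal::isApprox instead of exact equality.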
- //VERIFY( m(m.size()-1) == high ); - //VERIFY( m(0) == low ); + VERIFY( internal::isApprox(m(m.size()-1),high) ); + VERIFY( size==1 || internal::isApprox(m(0),low) ); // check whether everything works with row and col major vectors Matrix row_vector(size); @@ -126,5 +128,12 @@ void test_nullary() CALL_SUBTEST_8( testVectorType(Vector4f()) ); CALL_SUBTEST_8( testVectorType(Matrix()) ); CALL_SUBTEST_8( testVectorType(Matrix()) ); + + CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(1,300))) ); } + +#ifdef EIGEN_TEST_PART_6 + // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79). + VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits::epsilon() ); +#endif } -- cgit v1.2.3 From 6e0a86194ce6664e83d8035cbdd6047e5a27ed43 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Feb 2016 15:00:04 +0100 Subject: Fix integer path for num_steps==1 --- Eigen/src/Core/functors/NullaryFunctors.h | 8 ++++---- test/nullary.cpp | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index 71629af4c..c5836d048 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -99,24 +99,24 @@ template struct linspaced_op_impl { linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : - m_low(low), m_length(high-low), m_numSteps(num_steps), m_interPacket(plset(0)) + m_low(low), m_length(high-low), m_divisor(num_steps==1?1:num_steps-1), m_interPacket(plset(0)) {} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { - return m_low + (m_length*Scalar(i))/(m_numSteps-1); + return m_low + (m_length*Scalar(i))/m_divisor; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return internal::padd(pset1(m_low), pdiv(pmul(pset1(m_length), padd(pset1(Scalar(i)),m_interPacket)), - pset1(m_numSteps-1))); } + pset1(m_divisor))); } const Scalar m_low; const Scalar m_length; - const Index m_numSteps; + const Index m_divisor; const Packet m_interPacket; }; diff --git a/test/nullary.cpp b/test/nullary.cpp index 8d65910eb..cb87695ee 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -130,6 +130,7 @@ void test_nullary() CALL_SUBTEST_8( testVectorType(Matrix()) ); CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(1,300))) ); + CALL_SUBTEST_9( testVectorType(Matrix()) ); } #ifdef EIGEN_TEST_PART_6 -- cgit v1.2.3 From ec469700dcf82fd9b5668fe3c82d9dac49d147df Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Feb 2016 15:04:33 +0100 Subject: bug #557: make InnerIterator of sparse storage types more versatile by adding default-ctor, copy-ctor/assignment --- Eigen/src/SparseCore/SparseCompressedBase.h | 21 ++++++++++++++++++++- test/sparse_basic.cpp | 27 +++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index f78d7c24d..ea71b41d1 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -117,6 +117,24 @@ template class SparseCompressedBase::InnerIterator { public: + InnerIterator() + : m_values(0), m_indices(0), m_outer(0), m_id(0), m_end(0) + {} + + InnerIterator(const InnerIterator& other) + : m_values(other.m_values), m_indices(other.m_indices), m_outer(other.m_outer), m_id(other.m_id), m_end(other.m_end) + {} + + 
InnerIterator& operator=(const InnerIterator& other) { m_values = other.m_values; m_indices = other.m_indices; const_cast(m_outer).setValue(other.m_outer.value()); m_id = other.m_id; m_end = other.m_end; return *this; } + InnerIterator(const SparseCompressedBase& mat, Index outer) : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer) { @@ -162,7 +180,8 @@ class SparseCompressedBase::InnerIterator protected: const Scalar* m_values; const StorageIndex* m_indices; - const internal::variable_if_dynamic m_outer; + typedef internal::variable_if_dynamic OuterType; + const OuterType m_outer; Index m_id; Index m_end; private: diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 5a5650705..0a06c828b 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -460,6 +460,33 @@ template void sparse_basic(const SparseMatrixType& re refMat1.setIdentity(); VERIFY_IS_APPROX(m1, refMat1); } + + // test array/vector of InnerIterator + { + typedef typename SparseMatrixType::InnerIterator IteratorType; + + DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); + SparseMatrixType m2(rows, cols); + initSparse(density, refMat2, m2); + IteratorType static_array[2]; + static_array[0] = IteratorType(m2,0); + static_array[1] = IteratorType(m2,m2.outerSize()-1); + VERIFY( static_array[0] || m2.innerVector(static_array[0].outer()).nonZeros() == 0 ); + VERIFY( static_array[1] || m2.innerVector(static_array[1].outer()).nonZeros() == 0 ); + if(static_array[0] && static_array[1]) + { + ++(static_array[1]); + static_array[1] = IteratorType(m2,0); + VERIFY( static_array[1] ); + VERIFY( static_array[1].index() == static_array[0].index() ); + VERIFY( static_array[1].outer() == static_array[0].outer() ); + VERIFY( static_array[1].value() == static_array[0].value() ); + } + + std::vector iters(2); + iters[0] = IteratorType(m2,0); + iters[1] = IteratorType(m2,m2.outerSize()-1); + } } -- cgit v1.2.3 From ff1157bcbf1998c80d96612ed201b6a20db2de5f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Feb 2016 16:09:34 +0100 Subject: bug #694: document that SparseQR::matrixR is not sorted. --- Eigen/src/SparseQR/SparseQR.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 0d448d02e..acd7f7e10 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -128,6 +128,17 @@ class SparseQR : public SparseSolverBase > inline Index cols() const { return m_pmat.cols();} /** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization. + * \warning The entries of the returned matrix are not sorted. This means that using it in algorithms + * expecting sorted entries will fail. This includes random coefficient accesses (SparseMatrix::coeff()), + * and coefficient-wise operations. Matrix products and triangular solves are fine though. + * + * To sort the entries, you can assign it to a row-major matrix, and if a column-major matrix + * is required, you can copy it again: + * \code + * SparseMatrix R = qr.matrixR(); // column-major, not sorted!
+ * SparseMatrix Rr = qr.matrixR(); // row-major, sorted + * SparseMatrix Rc = Rr; // column-major, sorted + * \endcode */ const QRMatrixType& matrixR() const { return m_R; } -- cgit v1.2.3 From 11bb71c8fcd1fa0f663cedda707a36d40952eca9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 Feb 2016 07:34:59 -0800 Subject: Sharded the tensor device test --- unsupported/test/cxx11_tensor_device.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index ed5dd7505..80cf9ffba 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -383,6 +383,6 @@ static void test_gpu() { void test_cxx11_tensor_device() { - CALL_SUBTEST(test_cpu()); - CALL_SUBTEST(test_gpu()); + CALL_SUBTEST_1(test_cpu()); + CALL_SUBTEST_2(test_gpu()); } -- cgit v1.2.3 From 264f8141f86e84312f0eea9e741d2260ed839890 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 Feb 2016 07:44:31 -0800 Subject: Sharded the tensor reduction test --- unsupported/test/cxx11_tensor_reduction_cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 417242586..cad0c08e0 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -54,6 +54,6 @@ static void test_full_reductions() { } void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST(test_full_reductions()); - CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST_1(test_full_reductions()); + CALL_SUBTEST_2(test_full_reductions()); } -- cgit v1.2.3 From 6b5dff875e4ba2235f255b7cf0a86b7abed21df0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 Feb 2016 12:46:32 -0800 Subject: Made it possible to limit the number of blocks that will be used to evaluate a tensor expression on a CUDA device. This makes it possible to set aside streaming multiprocessors for other computations. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 12 +++++++++--- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 5abdc489b..e684ab8f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface { struct GpuDevice { // The StreamInterface is not owned: the caller is // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream) { + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { eigen_assert(stream); } - // TODO(bsteiner): This is an internal API, we should not expose it. EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return stream_->stream(); @@ -246,6 +248,10 @@ struct GpuDevice { #endif } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + // This function checks if the CUDA runtime recorded an error for the // underlying stream device.
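// For illustration, a hypothetical caller-side sketch of the new block cap (CudaStreamDevice and
// the two-argument GpuDevice constructor are the types patched above; the cap of 16 is arbitrary):
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice device(&stream, /*num_blocks=*/16);  // device.maxBlocks() == 16
//   out.device(device) = in1 + in2;  // TensorExecutor consults maxBlocks() when sizing the launch
// The ok() helper declared next is how such code can poll the stream for launch errors.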
inline bool ok() const { @@ -259,7 +265,7 @@ struct GpuDevice { private: const StreamInterface* stream_; - + int max_blocks_; }; #ifndef __CUDA_ARCH__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d2ab70f2b..df15c6204 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run( if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; + const int max_blocks = numext::maxi(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); const Index size = array_prod(evaluator.dimensions()); // Create at least one block to ensure we won't crash if we're called with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); @@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(c if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; + const int max_blocks = numext::maxi(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); const Index size = array_prod(evaluator.dimensions()); // Create at least one block to ensure we won't crash if we're called with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); -- cgit v1.2.3 From 922b5f527b56ba2fcf1e5a4da5216a29afdbb312 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 Feb 2016 13:30:49 -0800 Subject: Silenced a few compilation warnings --- unsupported/test/cxx11_tensor_device.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index 80cf9ffba..cbe9e6449 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -109,19 +109,19 @@ struct GPUContext { // The actual expression to evaluate template -static void test_contextual_eval(Context* context) +void test_contextual_eval(Context* context) { context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } template -static void test_forced_contextual_eval(Context* context) +void test_forced_contextual_eval(Context* context) { context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } template -static void test_compound_assignment(Context* context) +void test_compound_assignment(Context* context) { context->out().device(context->device()) = context->in1().constant(2.718f); context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; } template -static void test_contraction(Context* context) +void test_contraction(Context* context) { Eigen::array, 2> dims; dims[0] = std::make_pair(1, 1); dims[1] = std::make_pair(2, 2); +@@ -145,7 +145,7 @@ static void test_contraction(Context* context) } template -static void test_1d_convolution(Context* context) +void test_1d_convolution(Context* context) { Eigen::DSizes indices(0,0,0);
Eigen::DSizes sizes(40,49,70); @@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context) } template -static void test_2d_convolution(Context* context) +void test_2d_convolution(Context* context) { Eigen::DSizes indices(0,0,0); Eigen::DSizes sizes(40,49,69); @@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context) } template -static void test_3d_convolution(Context* context) +void test_3d_convolution(Context* context) { Eigen::DSizes indices(0,0,0); Eigen::DSizes sizes(39,49,69); @@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context) } -static void test_cpu() { +void test_cpu() { Eigen::Tensor in1(40,50,70); Eigen::Tensor in2(40,50,70); Eigen::Tensor out(40,50,70); @@ -267,7 +267,7 @@ static void test_cpu() { } } -static void test_gpu() { +void test_gpu() { Eigen::Tensor in1(40,50,70); Eigen::Tensor in2(40,50,70); Eigen::Tensor out(40,50,70); -- cgit v1.2.3 From 0ce5d32be583c0a2592158ad59ce7ad11125d645 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 Feb 2016 13:33:23 -0800 Subject: Sharded the cxx11_tensor_contract_cuda test --- unsupported/test/cxx11_tensor_contract_cuda.cu | 48 ++++++++++++++------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index cbd902d6a..2c3cf64a9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -22,7 +22,7 @@ using Eigen::Tensor; typedef Tensor::DimensionPair DimPair; template -static void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_cuda_contraction(int m_size, int k_size, int n_size) { std::cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is @@ -88,37 +88,39 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) void test_cxx11_tensor_cuda() { std::cout << "Calling contraction tests" << std::endl; - CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); - CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(128, k, 128)); - CALL_SUBTEST(test_cuda_contraction(128, k, 128)); + CALL_SUBTEST_2(test_cuda_contraction(128, k, 128)); + CALL_SUBTEST_3(test_cuda_contraction(128, k, 128)); } for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(128, 128, k)); - CALL_SUBTEST(test_cuda_contraction(128, 128, k)); + CALL_SUBTEST_4(test_cuda_contraction(128, 128, k)); + CALL_SUBTEST_5(test_cuda_contraction(128, 128, k)); } for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(k, 128, 128)); - CALL_SUBTEST(test_cuda_contraction(k, 128, 128)); + CALL_SUBTEST_6(test_cuda_contraction(k, 128, 128)); + CALL_SUBTEST_7(test_cuda_contraction(k, 128, 128)); } - int m_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - int n_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; + static const int m_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; + static const int n_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; + static const int k_sizes[] = 
{31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; - for (int i = 0; i <15; i++) - for (int j = 0; j < 15; j++) + for (int i = 0; i <15; i++) { + for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - CALL_SUBTEST(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); - CALL_SUBTEST(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); + CALL_SUBTEST_8(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); + CALL_SUBTEST_9(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); } + } + } } -- cgit v1.2.3 From 64ce78c2ec52aa2fd2e408c7c4160b06e8fc1a03 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 Feb 2016 13:57:41 -0800 Subject: Cleaned up a tensor contraction test --- unsupported/test/cxx11_tensor_contract_cuda.cu | 86 +++++++++++++++++--------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index 2c3cf64a9..6d1ef07f9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -24,14 +24,14 @@ typedef Tensor::DimensionPair DimPair; template void test_cuda_contraction(int m_size, int k_size, int n_size) { - std::cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor t_left(Eigen::array(m_size, k_size)); - Tensor t_right(Eigen::array(k_size, n_size)); - Tensor t_result(Eigen::array(m_size, n_size)); - Tensor t_result_gpu(Eigen::array(m_size, n_size)); + Tensor t_left(m_size, k_size); + Tensor t_right(k_size, n_size); + Tensor t_result(m_size, n_size); + Tensor t_result_gpu(m_size, n_size); Eigen::array dims(DimPair(1, 0)); t_left.setRandom(); @@ -84,43 +84,69 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) cudaFree((void*)d_t_result); } - -void test_cxx11_tensor_cuda() -{ - std::cout << "Calling contraction tests" << std::endl; - CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); - CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); +template +void test_cuda_contraction_m() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST_2(test_cuda_contraction(128, k, 128)); - CALL_SUBTEST_3(test_cuda_contraction(128, k, 128)); + test_cuda_contraction(k, 128, 128); + test_cuda_contraction(k, 128, 128); } +} + +template +void test_cuda_contraction_k() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST_4(test_cuda_contraction(128, 128, k)); - CALL_SUBTEST_5(test_cuda_contraction(128, 128, k)); + test_cuda_contraction(128, k, 128); + test_cuda_contraction(128, k, 128); } +} + +template +void test_cuda_contraction_n() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST_6(test_cuda_contraction(k, 128, 128)); - CALL_SUBTEST_7(test_cuda_contraction(k, 128, 128)); + test_cuda_contraction(128, 128, k); + test_cuda_contraction(128, 128, k); } +} - static const int m_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - static const int n_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - static const int k_sizes[] = {31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; +template +void test_cuda_contraction_sizes() { + int m_sizes[] = { 31, 39, 63, 
64, 65, + 127, 129, 255, 257 , 511, + 512, 513, 1023, 1024, 1025}; + + int n_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; - for (int i = 0; i <15; i++) { + int k_sizes[] = { 31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; + + for (int i = 0; i < 15; i++) { for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - CALL_SUBTEST_8(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); - CALL_SUBTEST_9(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); + test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k]); } } } } + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); + + CALL_SUBTEST_2(test_cuda_contraction_m()); + CALL_SUBTEST_3(test_cuda_contraction_m()); + + CALL_SUBTEST_4(test_cuda_contraction_k()); + CALL_SUBTEST_5(test_cuda_contraction_k()); + + CALL_SUBTEST_6(test_cuda_contraction_n()); + CALL_SUBTEST_7(test_cuda_contraction_n()); + + CALL_SUBTEST_8(test_cuda_contraction_sizes()); + CALL_SUBTEST_9(test_cuda_contraction_sizes()); +} -- cgit v1.2.3 From aedea349aabb44d51a4e64cd2c96242f0cea95ba Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Mon, 1 Feb 2016 20:25:02 -0700 Subject: Replace separate low word constructors with a single templated constructor. --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 21 ++++++++------------- unsupported/test/cxx11_tensor_uint128.cpp | 2 +- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 0d34f7ee6..981515f4b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -33,24 +33,19 @@ struct TensorUInt128 HIGH high; LOW low; + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(int32_t x) : high(0), low(x) { - eigen_assert(x >= 0); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(uint32_t x) : high(0), low(x) { } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(long x) : high(0), low(x) { - eigen_assert(x >= 0); + TensorUInt128(const TensorUInt128& other) : high(other.high), low(other.low) { + static_assert(sizeof(OTHER_HIGH) <= sizeof(HIGH), "high too wide"); + static_assert(sizeof(OTHER_LOW) <= sizeof(LOW), "low too wide"); } + + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(unsigned long x) : high(0), low(x) { } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(int64_t x) : high(0), low(x) { + explicit TensorUInt128(const T& x) : high(0), low(x) { eigen_assert(x >= 0); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(uint64_t x) : high(0), low(x) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { } diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp index ee3767e58..424c70197 100644 --- a/unsupported/test/cxx11_tensor_uint128.cpp +++ b/unsupported/test/cxx11_tensor_uint128.cpp @@ -127,7 +127,7 @@ void test_misc2() { TensorUInt128 result = (TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) - TensorUInt128, static_val<0> >(1, 0) + TensorUInt128, static_val<1> >(1)); uint64_t actual = static_cast(result); - VERIFY_EQUAL(actual, expected); + VERIFY_IS_EQUAL(actual, expected); } } } -- cgit v1.2.3 From 99cde88341145c43fc4134af07556d8c6ff12066 Mon Sep 17 00:00:00 2001 From: Benoit Steiner 
Date: Tue, 2 Feb 2016 11:06:53 -0800 Subject: Don't try to use direct offsets when computing a tensor product, since the required stride isn't available. --- unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 63c8ae126..392aa6d37 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -344,7 +344,7 @@ class TensorContractionSubMapper { enum { // We can use direct offsets iff the parent mapper supports then and we can compute the strides. // TODO: we should also enable direct offsets for the Rhs case. - UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets + UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0) }; EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) -- cgit v1.2.3 From 783018d8f65faeec0fc6f795bc2630240ecdd051 Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Tue, 2 Feb 2016 16:45:12 -0700 Subject: Use EIGEN_STATIC_ASSERT for backward compatibility. --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 981515f4b..52d5b7b1a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -36,8 +36,8 @@ struct TensorUInt128 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(const TensorUInt128& other) : high(other.high), low(other.low) { - static_assert(sizeof(OTHER_HIGH) <= sizeof(HIGH), "high too wide"); - static_assert(sizeof(OTHER_LOW) <= sizeof(LOW), "low too wide"); + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), "high too wide"); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), "low too wide"); } template -- cgit v1.2.3 From c85fbfd0b747b9af48144bab9a79127ab2b6257b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 3 Feb 2016 16:08:43 +0100 Subject: Clarify documentation on the restrictions of writable sparse block expressions. --- doc/SparseQuickReference.dox | 62 +++++++++++++++++++++++++------------------- doc/TutorialSparse.dox | 24 +++++++++++++++-- 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox index d04ac35c5..e0a30edcc 100644 --- a/doc/SparseQuickReference.dox +++ b/doc/SparseQuickReference.dox @@ -21,7 +21,7 @@ i.e either row major or column major. The default is column major. Most arithmet Resize/Reserve \code - sm1.resize(m,n); //Change sm1 to a m x n matrix. + sm1.resize(m,n); // Change sm1 to a m x n matrix. sm1.reserve(nnz); // Allocate room for nnz nonzeros elements. 
\endcode @@ -151,10 +151,10 @@ It is easy to perform arithmetic operations on sparse matrices provided that the Permutation \code -perm.indices(); // Reference to the vector of indices +perm.indices(); // Reference to the vector of indices sm1.twistedBy(perm); // Permute rows and columns -sm2 = sm1 * perm; //Permute the columns -sm2 = perm * sm1; // Permute the columns +sm2 = sm1 * perm; // Permute the columns +sm2 = perm * sm1; // Permute the columns \endcode @@ -181,9 +181,9 @@ sm2 = perm * sm1; // Permute the columns \section sparseotherops Other supported operations - + + - + - - + + - + + - - - + + + - - +
Operations Code Notes
Code Notes
Sub-matrices \code sm1.block(startRow, startCol, rows, cols); @@ -193,25 +193,31 @@ sm2 = perm * sm1; // Permute the columns sm1.bottomLeftCorner( rows, cols); sm1.bottomRightCorner( rows, cols); \endcode - +Contrary to dense matrices, here all these methods are read-only.\n +See \ref TutorialSparse_SubMatrices and below for read-write sub-matrices. +
Range
\code - sm1.innerVector(outer); - sm1.innerVectors(start, size); - sm1.leftCols(size); - sm2.rightCols(size); - sm1.middleRows(start, numRows); - sm1.middleCols(start, numCols); - sm1.col(j); + sm1.innerVector(outer); // RW + sm1.innerVectors(start, size); // RW + sm1.leftCols(size); // RW + sm2.rightCols(size); // RO because sm2 is row-major + sm1.middleRows(start, numRows); // RO becasue sm1 is column-major + sm1.middleCols(start, numCols); // RW + sm1.col(j); // RW \endcode A inner vector is either a row (for row-major) or a column (for column-major). As stated earlier, the evaluation can be done in a matrix with different storage order +A inner vector is either a row (for row-major) or a column (for column-major).\n +As stated earlier, for a read-write sub-matrix (RW), the evaluation can be done in a matrix with different storage order. +
Triangular and selfadjoint views \code sm2 = sm1.triangularview(); @@ -222,26 +228,30 @@ sm2 = perm * sm1; // Permute the columns \code \endcode
Triangular solve
\code dv2 = sm1.triangularView().solve(dv1); - dv2 = sm1.topLeftCorner(size, size).triangularView().solve(dv1); + dv2 = sm1.topLeftCorner(size, size) + .triangularView().solve(dv1); \endcode For general sparse solve, Use any suitable module described at \ref TopicSparseSystems
Low-level API \code -sm1.valuePtr(); // Pointer to the values -sm1.innerIndextr(); // Pointer to the indices. -sm1.outerIndexPtr(); //Pointer to the beginning of each inner vector +sm1.valuePtr(); // Pointer to the values +sm1.innerIndextr(); // Pointer to the indices. +sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector \endcode If the matrix is not in compressed form, makeCompressed() should be called before. Note that these functions are mostly provided for interoperability purposes with external libraries. A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section +If the matrix is not in compressed form, makeCompressed() should be called before.\n +Note that these functions are mostly provided for interoperability purposes with external libraries.\n +A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section
*/ diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox index 1f0be387d..352907408 100644 --- a/doc/TutorialSparse.dox +++ b/doc/TutorialSparse.dox @@ -241,11 +241,11 @@ In the following \em sm denotes a sparse matrix, \em sv a sparse vector, \em dm sm1.real() sm1.imag() -sm1 0.5*sm1 sm1+sm2 sm1-sm2 sm1.cwiseProduct(sm2) \endcode -However, a strong restriction is that the storage orders must match. For instance, in the following example: +However, a strong restriction is that the storage orders must match. For instance, in the following example: \code sm4 = sm1 + sm2 + sm3; \endcode -sm1, sm2, and sm3 must all be row-major or all column major. +sm1, sm2, and sm3 must all be row-major or all column-major. On the other hand, there is no restriction on the target matrix sm4. For instance, this means that for computing \f$ A^T + A \f$, the matrix \f$ A^T \f$ must be evaluated into a temporary matrix of compatible storage order: \code @@ -311,6 +311,26 @@ sm2 = sm1.transpose() * P; \endcode +\subsection TutorialSparse_SubMatrices Block operations + +Regarding read-access, sparse matrices expose the same API than for dense matrices to access to sub-matrices such as blocks, columns, and rows. See \ref TutorialBlockOperations for a detailed introduction. +However, for performance reasons, writing to a sub-sparse-matrix is much more limited, and currently only contiguous sets of columns (resp. rows) of a column-major (resp. row-major) SparseMatrix are writable. Moreover, this information has to be known at compile-time, leaving out methods such as block(...) and corner*(...). The available API for write-access to a SparseMatrix are summarized below: +\code +SparseMatrix sm1; +sm1.col(j) = ...; +sm1.leftCols(ncols) = ...; +sm1.middleCols(j,ncols) = ...; +sm1.rightCols(ncols) = ...; + +SparseMatrix sm2; +sm2.row(i) = ...; +sm2.topRows(nrows) = ...; +sm2.middleRows(i,nrows) = ...; +sm2.bottomRows(nrows) = ...; +\endcode + +In addition, sparse matrices expose the SparseMatrixBase::innerVector() and SparseMatrixBase::innerVectors() methods, which are aliases to the col/middleCols methods for a column-major storage, and to the row/middleRows methods for a row-major storage. + \subsection TutorialSparse_TriangularSelfadjoint Triangular and selfadjoint views Just as with dense matrices, the triangularView() function can be used to address a triangular part of the matrix, and perform triangular solves with a dense right hand side: -- cgit v1.2.3 From eb6d9aea0e27c3161db3c57d415a3996d60b80bc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 3 Feb 2016 16:58:23 +0100 Subject: Clarify error message when writing to a read-only sparse-sub-matrix. --- Eigen/src/SparseCore/SparseBlock.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 43fe788d9..3a811113f 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -73,8 +73,15 @@ public: Index m_outerStart; const internal::variable_if_dynamic m_outerSize; - public: - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) + protected: + // Disable assignment with clear error message. 
+ // Note that simply removing operator= yields compilation errors with ICC+MSVC + template + BlockImpl& operator=(const T&) + { + EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY); + return *this; + } }; @@ -424,8 +431,6 @@ public: friend struct internal::unary_evaluator, internal::IteratorBased, Scalar >; Index nonZeros() const { return Dynamic; } - - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) typename internal::ref_selector::non_const_type m_matrix; const internal::variable_if_dynamic m_startRow; @@ -433,6 +438,16 @@ public: const internal::variable_if_dynamic m_blockRows; const internal::variable_if_dynamic m_blockCols; + protected: + // Disable assignment with clear error message. + // Note that simply removing operator= yields compilation errors with ICC+MSVC + template + BlockImpl& operator=(const T&) + { + EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY); + return *this; + } + }; namespace internal { -- cgit v1.2.3 From c301f99208d8aa9985c6a029d602622a60fb5e7b Mon Sep 17 00:00:00 2001 From: Damien R Date: Wed, 3 Feb 2016 18:07:25 +0100 Subject: bug #1164: fix list and deque specializations such that our aligned allocator is automatically activatived only when the user did not specified an allocator (or specified the default std::allocator). --- Eigen/src/StlSupport/StdDeque.h | 18 ++-- Eigen/src/StlSupport/StdList.h | 18 ++-- test/CMakeLists.txt | 2 + test/stddeque_overload.cpp | 159 +++++++++++++++++++++++++++++++++ test/stdlist_overload.cpp | 192 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 363 insertions(+), 26 deletions(-) create mode 100644 test/stddeque_overload.cpp create mode 100644 test/stdlist_overload.cpp diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h index 25930cb85..cf1fedf92 100644 --- a/Eigen/src/StlSupport/StdDeque.h +++ b/Eigen/src/StlSupport/StdDeque.h @@ -13,32 +13,24 @@ #include "details.h" -// Define the explicit instantiation (e.g. necessary for the Intel compiler) -#if EIGEN_COMP_GNUC || EIGEN_COMP_ICC - #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) template class std::deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >; -#else - #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) -#endif - /** * This section contains a convenience MACRO which allows an easy specialization of * std::deque such that for data types with alignment issues the correct allocator * is used automatically. */ #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) 
\ -EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(__VA_ARGS__) \ namespace std \ { \ - template \ - class deque<__VA_ARGS__, _Ay> \ + template<> \ + class deque<__VA_ARGS__, std::allocator<__VA_ARGS__> > \ : public deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \ { \ typedef deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > deque_base; \ public: \ typedef __VA_ARGS__ value_type; \ - typedef typename deque_base::allocator_type allocator_type; \ - typedef typename deque_base::size_type size_type; \ - typedef typename deque_base::iterator iterator; \ + typedef deque_base::allocator_type allocator_type; \ + typedef deque_base::size_type size_type; \ + typedef deque_base::iterator iterator; \ explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {} \ template \ deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \ diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h index 7412b50aa..e1eba4985 100644 --- a/Eigen/src/StlSupport/StdList.h +++ b/Eigen/src/StlSupport/StdList.h @@ -12,32 +12,24 @@ #include "details.h" -// Define the explicit instantiation (e.g. necessary for the Intel compiler) -#if EIGEN_COMP_GNUC || EIGEN_COMP_ICC - #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) template class std::list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >; -#else - #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) -#endif - /** * This section contains a convenience MACRO which allows an easy specialization of * std::list such that for data types with alignment issues the correct allocator * is used automatically. */ #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) \ -EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(__VA_ARGS__) \ namespace std \ { \ - template \ - class list<__VA_ARGS__, _Ay> \ + template<> \ + class list<__VA_ARGS__, std::allocator<__VA_ARGS__> > \ : public list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \ { \ typedef list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > list_base; \ public: \ typedef __VA_ARGS__ value_type; \ - typedef typename list_base::allocator_type allocator_type; \ - typedef typename list_base::size_type size_type; \ - typedef typename list_base::iterator iterator; \ + typedef list_base::allocator_type allocator_type; \ + typedef list_base::size_type size_type; \ + typedef list_base::iterator iterator; \ explicit list(const allocator_type& a = allocator_type()) : list_base(a) {} \ template \ list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bbebf29cd..4420e0c51 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -226,7 +226,9 @@ ei_add_test(geo_homogeneous) ei_add_test(stdvector) ei_add_test(stdvector_overload) ei_add_test(stdlist) +ei_add_test(stdlist_overload) ei_add_test(stddeque) +ei_add_test(stddeque_overload) ei_add_test(sparse_basic) ei_add_test(sparse_block) ei_add_test(sparse_vector) diff --git a/test/stddeque_overload.cpp b/test/stddeque_overload.cpp new file mode 100644 index 000000000..d887e35ba --- /dev/null +++ b/test/stddeque_overload.cpp @@ -0,0 +1,159 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008 Benoit Jacob +// Copyright (C) 2010 Hauke Heibel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include +#include + +EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Vector4f) + +EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix2f) +EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4f) +EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4d) + +EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3f) +EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3d) + +EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaternionf) +EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaterniond) + +template +void check_stddeque_matrix(const MatrixType& m) +{ + typename MatrixType::Index rows = m.rows(); + typename MatrixType::Index cols = m.cols(); + MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols); + std::deque v(10, MatrixType(rows,cols)), w(20, y); + v[5] = x; + w[6] = v[5]; + VERIFY_IS_APPROX(w[6], v[5]); + v = w; + for(int i = 0; i < 20; i++) + { + VERIFY_IS_APPROX(w[i], v[i]); + } + + v.resize(21); + v[20] = x; + VERIFY_IS_APPROX(v[20], x); + v.resize(22,y); + VERIFY_IS_APPROX(v[21], y); + v.push_back(x); + VERIFY_IS_APPROX(v[22], x); + VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType)); + + // do a lot of push_back such that the deque gets internally resized + // (with memory reallocation) + MatrixType* ref = &w[0]; + for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i) + v.push_back(w[i%w.size()]); + for(unsigned int i=23; i +void check_stddeque_transform(const TransformType&) +{ + typedef typename TransformType::MatrixType MatrixType; + TransformType x(MatrixType::Random()), y(MatrixType::Random()); + std::deque v(10), w(20, y); + v[5] = x; + w[6] = v[5]; + VERIFY_IS_APPROX(w[6], v[5]); + v = w; + for(int i = 0; i < 20; i++) + { + VERIFY_IS_APPROX(w[i], v[i]); + } + + v.resize(21); + v[20] = x; + VERIFY_IS_APPROX(v[20], x); + v.resize(22,y); + VERIFY_IS_APPROX(v[21], y); + v.push_back(x); + VERIFY_IS_APPROX(v[22], x); + + // do a lot of push_back such that the deque gets internally resized + // (with memory reallocation) + TransformType* ref = &w[0]; + for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i) + v.push_back(w[i%w.size()]); + for(unsigned int i=23; i +void check_stddeque_quaternion(const QuaternionType&) +{ + typedef typename QuaternionType::Coefficients Coefficients; + QuaternionType x(Coefficients::Random()), y(Coefficients::Random()); + std::deque v(10), w(20, y); + v[5] = x; + w[6] = v[5]; + VERIFY_IS_APPROX(w[6], v[5]); + v = w; + for(int i = 0; i < 20; i++) + { + VERIFY_IS_APPROX(w[i], v[i]); + } + + v.resize(21); + v[20] = x; + VERIFY_IS_APPROX(v[20], x); + v.resize(22,y); + VERIFY_IS_APPROX(v[21], y); + v.push_back(x); + VERIFY_IS_APPROX(v[22], x); + + // do a lot of push_back such that the deque gets internally resized + // (with memory reallocation) + QuaternionType* ref = &w[0]; + for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i) + v.push_back(w[i%w.size()]); + for(unsigned int i=23; i +// Copyright (C) 2010 Hauke Heibel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include +#include + +EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Vector4f) + +EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix2f) +EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4f) +EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4d) + +EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3f) +EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3d) + +EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaternionf) +EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaterniond) + +template +typename Container::iterator get(Container & c, Position position) +{ + typename Container::iterator it = c.begin(); + std::advance(it, position); + return it; +} + +template +void set(Container & c, Position position, const Value & value) +{ + typename Container::iterator it = c.begin(); + std::advance(it, position); + *it = value; +} + +template +void check_stdlist_matrix(const MatrixType& m) +{ + typename MatrixType::Index rows = m.rows(); + typename MatrixType::Index cols = m.cols(); + MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols); + std::list v(10, MatrixType(rows,cols)), w(20, y); + typename std::list::iterator itv = get(v, 5); + typename std::list::iterator itw = get(w, 6); + *itv = x; + *itw = *itv; + VERIFY_IS_APPROX(*itw, *itv); + v = w; + itv = v.begin(); + itw = w.begin(); + for(int i = 0; i < 20; i++) + { + VERIFY_IS_APPROX(*itw, *itv); + ++itv; + ++itw; + } + + v.resize(21); + set(v, 20, x); + VERIFY_IS_APPROX(*get(v, 20), x); + v.resize(22,y); + VERIFY_IS_APPROX(*get(v, 21), y); + v.push_back(x); + VERIFY_IS_APPROX(*get(v, 22), x); + + // do a lot of push_back such that the list gets internally resized + // (with memory reallocation) + MatrixType* ref = &(*get(w, 0)); + for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i) + v.push_back(*get(w, i%w.size())); + for(unsigned int i=23; i +void check_stdlist_transform(const TransformType&) +{ + typedef typename TransformType::MatrixType MatrixType; + TransformType x(MatrixType::Random()), y(MatrixType::Random()); + std::list v(10), w(20, y); + typename std::list::iterator itv = get(v, 5); + typename std::list::iterator itw = get(w, 6); + *itv = x; + *itw = *itv; + VERIFY_IS_APPROX(*itw, *itv); + v = w; + itv = v.begin(); + itw = w.begin(); + for(int i = 0; i < 20; i++) + { + VERIFY_IS_APPROX(*itw, *itv); + ++itv; + ++itw; + } + + v.resize(21); + set(v, 20, x); + VERIFY_IS_APPROX(*get(v, 20), x); + v.resize(22,y); + VERIFY_IS_APPROX(*get(v, 21), y); + v.push_back(x); + VERIFY_IS_APPROX(*get(v, 22), x); + + // do a lot of push_back such that the list gets internally resized + // (with memory reallocation) + TransformType* ref = &(*get(w, 0)); + for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i) + v.push_back(*get(w, i%w.size())); + for(unsigned int i=23; imatrix()==get(w, (i-23)%w.size())->matrix()); + } +} + +template +void check_stdlist_quaternion(const QuaternionType&) +{ + typedef typename QuaternionType::Coefficients Coefficients; + QuaternionType x(Coefficients::Random()), y(Coefficients::Random()); + std::list v(10), w(20, y); + typename std::list::iterator itv = get(v, 5); + typename std::list::iterator itw = get(w, 6); + *itv = x; + *itw = *itv; + VERIFY_IS_APPROX(*itw, *itv); + v = w; + itv = v.begin(); + itw = w.begin(); + for(int i = 0; i < 20; i++) + { + VERIFY_IS_APPROX(*itw, *itv); + ++itv; + ++itw; + } + + v.resize(21); + set(v, 20, x); + VERIFY_IS_APPROX(*get(v, 20), x); + v.resize(22,y); + VERIFY_IS_APPROX(*get(v, 21), y); + v.push_back(x); + VERIFY_IS_APPROX(*get(v, 22), x); + + // do a lot of push_back such that the 
list gets internally resized + // (with memory reallocation) + QuaternionType* ref = &(*get(w, 0)); + for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i) + v.push_back(*get(w, i%w.size())); + for(unsigned int i=23; icoeffs()==get(w, (i-23)%w.size())->coeffs()); + } +} + +void test_stdlist_overload() +{ + // some non vectorizable fixed sizes + CALL_SUBTEST_1(check_stdlist_matrix(Vector2f())); + CALL_SUBTEST_1(check_stdlist_matrix(Matrix3f())); + CALL_SUBTEST_2(check_stdlist_matrix(Matrix3d())); + + // some vectorizable fixed sizes + CALL_SUBTEST_1(check_stdlist_matrix(Matrix2f())); + CALL_SUBTEST_1(check_stdlist_matrix(Vector4f())); + CALL_SUBTEST_1(check_stdlist_matrix(Matrix4f())); + CALL_SUBTEST_2(check_stdlist_matrix(Matrix4d())); + + // some dynamic sizes + CALL_SUBTEST_3(check_stdlist_matrix(MatrixXd(1,1))); + CALL_SUBTEST_3(check_stdlist_matrix(VectorXd(20))); + CALL_SUBTEST_3(check_stdlist_matrix(RowVectorXf(20))); + CALL_SUBTEST_3(check_stdlist_matrix(MatrixXcf(10,10))); + + // some Transform + CALL_SUBTEST_4(check_stdlist_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9 + CALL_SUBTEST_4(check_stdlist_transform(Affine3f())); + CALL_SUBTEST_4(check_stdlist_transform(Affine3d())); + + // some Quaternion + CALL_SUBTEST_5(check_stdlist_quaternion(Quaternionf())); + CALL_SUBTEST_5(check_stdlist_quaternion(Quaterniond())); +} -- cgit v1.2.3 From 70dc14e4e10c08bdfe77d8a32cf14b81af9022fa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 3 Feb 2016 18:25:41 +0100 Subject: bug #1161: fix division by zero for huge scalar types --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index d2e6f26c8..54e118395 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -178,7 +178,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // We also include a register-level block of the result (mx x nr). // (In an ideal world only the lhs panel would stay in L1) // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: - const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1)); + const Index max_kc = std::max(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); const Index old_k = k; if(k>max_kc) { -- cgit v1.2.3 From d9a6f86cc080c54eaf78957efa44cb915d8ef179 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 3 Feb 2016 09:55:30 -0800 Subject: Make the array of directly compute column norms a member to avoid allocation in computeInPlace. 
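The pattern this commit applies generalizes beyond QR: hoist a per-call temporary into a member so that repeated factorizations reuse its storage instead of reallocating. A minimal sketch, assuming a hypothetical Factorizer class (names are illustrative; the real change to ColPivHouseholderQR follows below):

#include <Eigen/Dense>

class Factorizer {
  Eigen::RowVectorXd m_colNormsDirect;   // workspace that now persists across calls
 public:
  void computeInPlace(const Eigen::MatrixXd& m) {
    m_colNormsDirect.resize(m.cols());   // no reallocation once the size matches
    for (Eigen::Index k = 0; k < m.cols(); ++k)
      m_colNormsDirect(k) = m.col(k).norm();  // previously filled a function-local vector
    // ... pivoting logic reads and updates m_colNormsDirect ...
  }
};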
--- Eigen/src/QR/ColPivHouseholderQR.h | 53 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 61c6fdf09..a13965ff0 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -86,7 +86,8 @@ template class ColPivHouseholderQR m_colsPermutation(), m_colsTranspositions(), m_temp(), - m_colNorms(), + m_colNormsUpdated(), + m_colNormsDirect(), m_isInitialized(false), m_usePrescribedThreshold(false) {} @@ -102,7 +103,8 @@ template class ColPivHouseholderQR m_colsPermutation(PermIndexType(cols)), m_colsTranspositions(cols), m_temp(cols), - m_colNorms(cols), + m_colNormsUpdated(cols), + m_colNormsDirect(cols), m_isInitialized(false), m_usePrescribedThreshold(false) {} @@ -125,7 +127,8 @@ template class ColPivHouseholderQR m_colsPermutation(PermIndexType(matrix.cols())), m_colsTranspositions(matrix.cols()), m_temp(matrix.cols()), - m_colNorms(matrix.cols()), + m_colNormsUpdated(matrix.cols()), + m_colNormsDirect(matrix.cols()), m_isInitialized(false), m_usePrescribedThreshold(false) { @@ -413,7 +416,8 @@ template class ColPivHouseholderQR PermutationType m_colsPermutation; IntRowVectorType m_colsTranspositions; RowVectorType m_temp; - RealRowVectorType m_colNorms; + RealRowVectorType m_colNormsUpdated; + RealRowVectorType m_colNormsDirect; bool m_isInitialized, m_usePrescribedThreshold; RealScalar m_prescribedThreshold, m_maxpivot; Index m_nonzero_pivots; @@ -475,12 +479,16 @@ void ColPivHouseholderQR::computeInPlace() m_colsTranspositions.resize(m_qr.cols()); Index number_of_transpositions = 0; - m_colNorms.resize(cols); - for (Index k = 0; k < cols; ++k) - m_colNorms.coeffRef(k) = m_qr.col(k).norm(); - RealRowVectorType colNormsMostRecentDirect(m_colNorms); + m_colNormsUpdated.resize(cols); + m_colNormsDirect.resize(cols); + for (Index k = 0; k < cols; ++k) { + // colNormsDirect(k) caches the most recent directly computed norm of + // column k. 
+ m_colNormsDirect.coeffRef(k) = m_qr.col(k).norm(); + m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k); + } - RealScalar threshold_helper = numext::abs2(m_colNorms.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); + RealScalar threshold_helper = numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); RealScalar norm_downdate_threshold = numext::sqrt(NumTraits::epsilon()); m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case) @@ -488,9 +496,9 @@ void ColPivHouseholderQR::computeInPlace() for(Index k = 0; k < size; ++k) { - // first, we look up in our table m_colNorms which column has the biggest norm + // first, we look up in our table m_colNormsUpdated which column has the biggest norm Index biggest_col_index; - RealScalar biggest_col_sq_norm = numext::abs2(m_colNorms.tail(cols-k).maxCoeff(&biggest_col_index)); + RealScalar biggest_col_sq_norm = numext::abs2(m_colNormsUpdated.tail(cols-k).maxCoeff(&biggest_col_index)); biggest_col_index += k; // Track the number of meaningful pivots but do not stop the decomposition to make @@ -502,9 +510,9 @@ void ColPivHouseholderQR::computeInPlace() m_colsTranspositions.coeffRef(k) = biggest_col_index; if(k != biggest_col_index) { m_qr.col(k).swap(m_qr.col(biggest_col_index)); - std::swap(m_colNorms.coeffRef(k), m_colNorms.coeffRef(biggest_col_index)); - std::swap(colNormsMostRecentDirect.coeffRef(k), - colNormsMostRecentDirect.coeffRef(biggest_col_index)); + std::swap(m_colNormsUpdated.coeffRef(k), m_colNormsUpdated.coeffRef(biggest_col_index)); + std::swap(m_colNormsDirect.coeffRef(k), + m_colNormsDirect.coeffRef(biggest_col_index)); ++number_of_transpositions; } @@ -528,20 +536,19 @@ void ColPivHouseholderQR::computeInPlace() // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf // and used in LAPACK routines xGEQPF and xGEQP3. // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html - if (m_colNorms.coeffRef(j) != 0) { - RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNorms.coeffRef(j); + if (m_colNormsUpdated.coeffRef(j) != 0) { + RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j); temp = (RealScalar(1) + temp) * (RealScalar(1) - temp); temp = temp < 0 ? 0 : temp; - RealScalar temp2 = - temp * numext::abs2(m_colNorms.coeffRef(j) / - colNormsMostRecentDirect.coeffRef(j)); + RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / + m_colNormsDirect.coeffRef(j)); if (temp2 <= norm_downdate_threshold) { - // The updated norm has become to inaccurate so re-compute the column + // The updated norm has become too inaccurate so re-compute the column // norm directly. - m_colNorms.coeffRef(j) = m_qr.col(j).tail(rows - k - 1).norm(); - colNormsMostRecentDirect.coeffRef(j) = m_colNorms.coeffRef(j); + m_colNormsDirect.coeffRef(j) = m_qr.col(j).tail(rows - k - 1).norm(); + m_colNormsUpdated.coeffRef(j) = m_colNormsDirect.coeffRef(j); } else { - m_colNorms.coeffRef(j) *= numext::sqrt(temp); + m_colNormsUpdated.coeffRef(j) *= numext::sqrt(temp); } } } -- cgit v1.2.3 From 5fb04ab2da509de116faa0f35e3038b14fd69573 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 3 Feb 2016 10:12:10 -0800 Subject: Fix bad line break. Don't repeat Kahan matrix test since it is deterministic. 
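The second point of this commit reflects a convention of Eigen's test harness: subtests driven by random inputs run inside the g_repeat loop, while a deterministic construction such as the Kahan matrix needs only one run. A sketch of a driver following that convention (the function names match qr_colpivoting.cpp; the surrounding structure is illustrative):

void test_qr_colpivoting()
{
  for (int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST_1( qr<MatrixXf>() );             // randomized input: repeat it
  }
  CALL_SUBTEST_1( qr_kahan_matrix<MatrixXf>() );  // deterministic: once suffices
}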
--- Eigen/src/QR/ColPivHouseholderQR.h | 3 +-- test/qr_colpivoting.cpp | 8 +++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index a13965ff0..efeb1f438 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -511,8 +511,7 @@ void ColPivHouseholderQR::computeInPlace() if(k != biggest_col_index) { m_qr.col(k).swap(m_qr.col(biggest_col_index)); std::swap(m_colNormsUpdated.coeffRef(k), m_colNormsUpdated.coeffRef(biggest_col_index)); - std::swap(m_colNormsDirect.coeffRef(k), - m_colNormsDirect.coeffRef(biggest_col_index)); + std::swap(m_colNormsDirect.coeffRef(k), m_colNormsDirect.coeffRef(biggest_col_index)); ++number_of_transpositions; } diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index 7b97292db..c777d5f94 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -212,11 +212,6 @@ void test_qr_colpivoting() CALL_SUBTEST_5(( qr_fixedsize, 1 >() )); } - for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_1( qr_kahan_matrix() ); - CALL_SUBTEST_2( qr_kahan_matrix() ); - } - for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( qr_invertible() ); CALL_SUBTEST_2( qr_invertible() ); @@ -233,4 +228,7 @@ void test_qr_colpivoting() // Test problem size constructors CALL_SUBTEST_9(ColPivHouseholderQR(10, 20)); + + CALL_SUBTEST_1( qr_kahan_matrix() ); + CALL_SUBTEST_2( qr_kahan_matrix() ); } -- cgit v1.2.3 From 492fe7ce02f69693a53a018024920c36911aa9a5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 12:51:19 -0800 Subject: Silenced some unhelpful warnings generated by nvcc. --- Eigen/src/Core/util/DisableStupidWarnings.h | 5 +++++ Eigen/src/Core/util/ReenableStupidWarnings.h | 3 +++ 2 files changed, 8 insertions(+) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 91c61fcf2..686a7c9ce 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -20,6 +20,7 @@ #pragma warning( push ) #endif #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800) + #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) // ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. 
inside of class body @@ -32,6 +33,7 @@ #pragma warning push #endif #pragma warning disable 2196 279 1684 2259 + #elif defined __clang__ // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant // this is really a stupid warning as it warns on compile-time expressions involving enums @@ -39,6 +41,9 @@ #pragma clang diagnostic push #endif #pragma clang diagnostic ignored "-Wconstant-logical-operand" + +#elif defined __NVCC__ + #pragma diag_suppress code_is_unreachable #endif #endif // not EIGEN_WARNINGS_DISABLED diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h index 5ddfbd4aa..4a2e0d54a 100644 --- a/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -8,6 +8,9 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop + #elif defined __NVCC__ + #pragma diag_warning code_is_unreachable + #pragma diag_warning initialization_not_reachable #endif #endif -- cgit v1.2.3 From d7742d22e4a1787dc16dac938b4f26601af7b488 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 13:47:28 -0800 Subject: Revert the nvcc messages to their default severity instead of the forcing them to be warnings --- Eigen/src/Core/util/ReenableStupidWarnings.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h index 4a2e0d54a..a45d17f9e 100644 --- a/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -9,8 +9,8 @@ #elif defined __clang__ #pragma clang diagnostic pop #elif defined __NVCC__ - #pragma diag_warning code_is_unreachable - #pragma diag_warning initialization_not_reachable + #pragma diag_default code_is_unreachable + #pragma diag_default initialization_not_reachable #endif #endif -- cgit v1.2.3 From af8436b19694901b580ef58283a6e8b64ccde7d2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 13:48:36 -0800 Subject: Silenced the "calling a __host__ function from a __host__ __device__ function is not allowed" messages --- unsupported/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index eed724bcf..33d31c098 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -158,7 +158,7 @@ if(CUDA_FOUND) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() - set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30") + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30 -Xcudafe \"--diag_suppress 2651 --diag_suppress 2653\"") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") -- cgit v1.2.3 From 5d82e47ef68d0293086227d4abe7a1c05d4f4fd8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 14:10:06 -0800 Subject: Properly disable nvcc warning messages in user code. 
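For compilers with a diagnostic state stack, the Disable/Reenable header pair brackets Eigen's headers so user code keeps its own warning settings; nvcc's #pragma diag_suppress has no push/pop equivalent, which is why this commit stops restoring the defaults and leaves the suppressions active through template instantiation. A rough sketch of the bracketing idiom, using the clang pragmas already shown above (the header names here are hypothetical):

// disable_warnings.h (hypothetical)
#if defined __clang__
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
#endif

// ... header body that would otherwise trigger the diagnostic ...

// reenable_warnings.h (hypothetical)
#if defined __clang__
  #pragma clang diagnostic pop    // restores the includer's diagnostic state
#endif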
--- Eigen/src/Core/util/DisableStupidWarnings.h | 4 ++++ Eigen/src/Core/util/ReenableStupidWarnings.h | 9 +++++++-- unsupported/test/CMakeLists.txt | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 686a7c9ce..d325bc062 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -44,6 +44,10 @@ #elif defined __NVCC__ #pragma diag_suppress code_is_unreachable + #pragma diag_suppress initialization_not_reachable + #pragma diag_suppress 2651 + #pragma diag_suppress 2653 + #endif #endif // not EIGEN_WARNINGS_DISABLED diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h index a45d17f9e..ea88e226c 100644 --- a/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -9,8 +9,13 @@ #elif defined __clang__ #pragma clang diagnostic pop #elif defined __NVCC__ - #pragma diag_default code_is_unreachable - #pragma diag_default initialization_not_reachable +// Don't reenable the diagnostic messages, as it turns out these messages need +// to be disabled at the point of the template instantiation (i.e the user code) +// otherwise they'll be triggeredby nvcc. +// #pragma diag_default code_is_unreachable +// #pragma diag_default initialization_not_reachable +// #pragma diag_default 2651 +// #pragma diag_default 2653 #endif #endif diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 33d31c098..42e0189a4 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -158,7 +158,7 @@ if(CUDA_FOUND) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() - set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30 -Xcudafe \"--diag_suppress 2651 --diag_suppress 2653\"") + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30 -Xcudafe \"--display_error_number\"") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") -- cgit v1.2.3 From f933f69021438af1a42f8dff9451cde0dce2a460 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 14:12:18 -0800 Subject: Added a few comments --- Eigen/src/Core/util/DisableStupidWarnings.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index d325bc062..3ed931855 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -43,8 +43,11 @@ #pragma clang diagnostic ignored "-Wconstant-logical-operand" #elif defined __NVCC__ + // Disable the "statement is unreachable" message #pragma diag_suppress code_is_unreachable + // Disable the "dynamic initialization in unreachable code" message #pragma diag_suppress initialization_not_reachable + // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages #pragma diag_suppress 2651 #pragma diag_suppress 2653 -- cgit v1.2.3 From bcbde37a1154ef0da3587709af888af54e8b9720 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 14:53:08 -0800 Subject: Made sure the code compiles when EIGEN_HAS_C99_MATH isn't defined --- Eigen/src/Core/SpecialFunctions.h | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git 
a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 9f89e184d..6c6b21f98 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -134,7 +134,24 @@ struct lgamma_impl { * Implementation of digamma (psi) * ****************************************************************************/ -#ifdef EIGEN_HAS_C99_MATH +template +struct digamma_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template +struct digamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else /* * @@ -202,14 +219,6 @@ struct digamma_impl_maybe_poly { } }; -#endif // EIGEN_HAS_C99_MATH - -template -struct digamma_retval { - typedef Scalar type; -}; - -#ifdef EIGEN_HAS_C99_MATH template struct digamma_impl { EIGEN_DEVICE_FUNC -- cgit v1.2.3 From 1cbb79cdfd4a43faf43f4095df456731d98961c0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 15:58:26 -0800 Subject: Made sure the dummy element of size 0 array is always intialized to silence some compiler warnings --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 89aeb03e7..4df0165b9 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -165,10 +165,10 @@ template class array { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array() { } + EIGEN_STRONG_INLINE array() : dummy(static_cast(0)) { } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC array(std::initializer_list l) { + EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy(static_cast(0)) { eigen_assert(l.size() == 0); } #endif -- cgit v1.2.3 From 727ff2696087fbd96815c33addc649086d31287c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 16:01:37 -0800 Subject: Disable 2 more nvcc warning messages --- Eigen/src/Core/util/DisableStupidWarnings.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 3ed931855..829b23ac8 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -47,10 +47,11 @@ #pragma diag_suppress code_is_unreachable // Disable the "dynamic initialization in unreachable code" message #pragma diag_suppress initialization_not_reachable - // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages + // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are 4 of them) #pragma diag_suppress 2651 #pragma diag_suppress 2653 - + #pragma diag_suppress 2668 + #pragma diag_suppress 2670 #endif #endif // not EIGEN_WARNINGS_DISABLED -- cgit v1.2.3 From 4ab63a3f6f6b57243a6016b29361a241a47cce46 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 17:23:07 -0800 Subject: Fixed the initialization of the dummy member of the array class to make it compatible with pairs of element. 
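On the last change above -- initializing the member with dummy() rather than static_cast<T>(0) -- the point is that value-initialization is defined for any default-constructible type, whereas casting from 0 compiles only for arithmetic, enum, and pointer types. A standalone sketch with a hypothetical type name:

#include <utility>

template <class T> struct ZeroSizedArray {     // stand-in for Eigen's array<T, 0>
  T dummy;
  ZeroSizedArray() : dummy() {}                // value-initialization: fine for std::pair
  // ZeroSizedArray() : dummy(static_cast<T>(0)) {}  // ill-formed when T is a pair
};

ZeroSizedArray<std::pair<int, int> > ok;       // compiles with the fixed form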
--- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 4df0165b9..56e2b8afc 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -165,10 +165,10 @@ template class array { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array() : dummy(static_cast(0)) { } + EIGEN_STRONG_INLINE array() : dummy() { } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy(static_cast(0)) { + EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy() { eigen_assert(l.size() == 0); } #endif -- cgit v1.2.3 From f53537899573d8463985a33906a82e6c05a7aff9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Feb 2016 18:58:29 -0800 Subject: Added support for vectorized type casting of int to char. --- Eigen/src/Core/GenericPacketMath.h | 5 +++++ .../Eigen/CXX11/src/Tensor/TensorConversion.h | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8f63af7cb..02882bdea 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -134,6 +134,11 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/) { return static_cast(a); } +template +EIGEN_DEVICE_FUNC inline TgtPacket +pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) { + return static_cast(a); +} /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 877bcd0df..d2defcaf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -86,6 +86,26 @@ struct PacketConverter { const TensorEvaluator& m_impl; }; +template +struct PacketConverter { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; template struct PacketConverter { -- cgit v1.2.3 From d5d7798b9d8e1a25aa8928a7e79d36ff7d72b7d7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 4 Feb 2016 09:53:47 +0100 Subject: Improve heuritics for switching between coeff-based and general matrix product implementation. 
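The template arguments of the modified product_type_selector specializations were lost in this rendering, so exactly which product shapes moved from GemmProduct to CoeffBasedProductMode is not visible here. The user-facing knob for the same trade-off is lazyProduct(), which forces the coefficient-based kernel regardless of the heuristic; a sketch:

#include <Eigen/Dense>

void product_paths() {
  Eigen::MatrixXf m1(4, 512), m2(512, 4);
  m1.setRandom(); m2.setRandom();
  Eigen::MatrixXf r1 = m1 * m2;             // operator* consults the heuristic
  Eigen::MatrixXf r2 = m1.lazyProduct(m2);  // always the coefficient-based kernel
}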
--- Eigen/src/Core/GeneralProduct.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index fe8204ac3..d2290241c 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -125,8 +125,8 @@ template<> struct product_type_selector { enum template<> struct product_type_selector { enum { ret = GemmProduct }; }; template<> struct product_type_selector { enum { ret = GemmProduct }; }; template<> struct product_type_selector { enum { ret = GemmProduct }; }; -template<> struct product_type_selector { enum { ret = GemmProduct }; }; -template<> struct product_type_selector { enum { ret = GemmProduct }; }; +template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; template<> struct product_type_selector { enum { ret = GemmProduct }; }; } // end namespace internal -- cgit v1.2.3 From 659fc9c1593e0bfd1b886557699573873198fb61 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 4 Feb 2016 09:55:09 +0100 Subject: Remove dead code --- Eigen/src/Core/GeneralProduct.h | 26 -------------------------- Eigen/src/Core/util/ForwardDeclarations.h | 4 ---- 2 files changed, 30 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index d2290241c..0769a212e 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -76,32 +76,6 @@ public: #endif }; -// template struct product_tag -// { -// private: -// -// typedef typename remove_all::type _Lhs; -// typedef typename remove_all::type _Rhs; -// enum { -// Rows = _Lhs::RowsAtCompileTime, -// Cols = _Rhs::ColsAtCompileTime, -// Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, _Rhs::RowsAtCompileTime) -// }; -// -// enum { -// rows_select = Rows==1 ? int(Rows) : int(Large), -// cols_select = Cols==1 ? int(Cols) : int(Large), -// depth_select = Depth==1 ? int(Depth) : int(Large) -// }; -// typedef product_type_selector selector; -// -// public: -// enum { -// ret = selector::ret -// }; -// -// }; - /* The following allows to select the kind of product at compile time * based on the three dimensions of the product. * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 483af876f..31c7088e7 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -94,10 +94,6 @@ template class CwiseBinaryOp; template class Solve; template class Inverse; -namespace internal { - template struct product_tag; -} - template class Product; template class DiagonalBase; -- cgit v1.2.3 From 2e39cc40a4bb3fa704651dd2b87364f5578b3a8b Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 4 Feb 2016 12:56:14 -0800 Subject: Fix condition that made the unit test spam stdout with bogus error messages. 
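The logic bug, restated: with approximate comparisons, "x approximately-or-less-than y" is not the negation of "y approximately-or-less-than x" -- nearly equal diagonal entries satisfy both, so the old condition printed the diagnostic for every well-behaved matrix. A scalar sketch with a stand-in comparator (test_isApproxOrLessThan itself is Eigen test machinery; the semantics below are assumed):

#include <cmath>

bool approx_or_less(double a, double b) {     // stand-in comparator
  return a < b || std::abs(a - b) < 1e-12;
}

void demo() {
  double x = 1.0, y = 1.0;                    // |r_ii| == |r_(i+1)(i+1)|
  bool old_cond = approx_or_less(x, y);       // true  -> bogus error report
  bool new_cond = !approx_or_less(y, x);      // false -> silent, as intended
  (void)old_cond; (void)new_cond;
}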
--- test/qr_colpivoting.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index c777d5f94..9c989823e 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -45,7 +45,7 @@ template void qr() RealScalar x = (std::abs)(r(i, i)); RealScalar y = (std::abs)(r(i + 1, i + 1)); if (x < threshold && y < threshold) continue; - if (test_isApproxOrLessThan(x, y)) { + if (!test_isApproxOrLessThan(y, x)) { for (Index j = 0; j < (std::min)(rows, cols); ++j) { std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl; } @@ -94,7 +94,7 @@ template void qr_fixedsize() RealScalar x = (std::abs)(r(i, i)); RealScalar y = (std::abs)(r(i + 1, i + 1)); if (x < threshold && y < threshold) continue; - if (test_isApproxOrLessThan(x, y)) { + if (!test_isApproxOrLessThan(y, x)) { for (Index j = 0; j < (std::min)(int(Rows), int(Cols)); ++j) { std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl; } @@ -138,7 +138,7 @@ template void qr_kahan_matrix() RealScalar x = (std::abs)(r(i, i)); RealScalar y = (std::abs)(r(i + 1, i + 1)); if (x < threshold && y < threshold) continue; - if (test_isApproxOrLessThan(x, y)) { + if (!test_isApproxOrLessThan(y, x)) { for (Index j = 0; j < (std::min)(rows, cols); ++j) { std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl; } -- cgit v1.2.3 From 62a1c911cd1fca97c381ac5cb21d0345934148a4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Feb 2016 21:24:35 +0100 Subject: Remove posix_memalign, _mm_malloc, and _aligned_malloc special paths. --- Eigen/src/Core/util/Memory.h | 110 +++-------------------------------------- doc/PreprocessorDirectives.dox | 3 -- 2 files changed, 8 insertions(+), 105 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 415bc48cb..84fb0516c 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -59,28 +59,6 @@ #endif -#ifndef EIGEN_HAS_POSIX_MEMALIGN - // See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554) - // It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first. - // Currently, let's include it only on unix systems: - #if EIGEN_OS_UNIX && !(EIGEN_OS_SUN || EIGEN_OS_SOLARIS) - #include - #if (EIGEN_OS_QNX || (defined _GNU_SOURCE) || EIGEN_COMP_PGI || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0) - #define EIGEN_HAS_POSIX_MEMALIGN 1 - #endif - #endif - - #ifndef EIGEN_HAS_POSIX_MEMALIGN - #define EIGEN_HAS_POSIX_MEMALIGN 0 - #endif -#endif - -#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX - #define EIGEN_HAS_MM_MALLOC 1 -#else - #define EIGEN_HAS_MM_MALLOC 0 -#endif - namespace Eigen { namespace internal { @@ -122,7 +100,7 @@ inline void handmade_aligned_free(void *ptr) /** \internal * \brief Reallocates aligned memory. - * Since we know that our handmade version is based on std::realloc + * Since we know that our handmade version is based on std::malloc * we can use std::realloc to implement efficient reallocation. 
*/ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0) @@ -141,47 +119,6 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = return aligned; } -/***************************************************************************** -*** Implementation of generic aligned realloc (when no realloc can be used)*** -*****************************************************************************/ - -EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); -EIGEN_DEVICE_FUNC void aligned_free(void *ptr); - -/** \internal - * \brief Reallocates aligned memory. - * Allows reallocation with aligned ptr types. This implementation will - * always create a new memory chunk and copy the old data. - */ -inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size) -{ - if (ptr==0) - return aligned_malloc(size); - - if (size==0) - { - aligned_free(ptr); - return 0; - } - - void* newptr = aligned_malloc(size); - if (newptr == 0) - { - #ifdef EIGEN_HAS_ERRNO - errno = ENOMEM; // according to the standard - #endif - return 0; - } - - if (ptr != 0) - { - std::memcpy(newptr, ptr, (std::min)(size,old_size)); - aligned_free(ptr); - } - - return newptr; -} - /***************************************************************************** *** Implementation of portable aligned versions of malloc/free/realloc *** *****************************************************************************/ @@ -218,16 +155,8 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) check_that_malloc_is_allowed(); void *result; - #if EIGEN_DEFAULT_ALIGN_BYTES==0 - result = std::malloc(size); - #elif EIGEN_MALLOC_ALREADY_ALIGNED + #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); - #elif EIGEN_HAS_POSIX_MEMALIGN - if(posix_memalign(&result, EIGEN_DEFAULT_ALIGN_BYTES, size)) result = 0; - #elif EIGEN_HAS_MM_MALLOC - result = _mm_malloc(size, EIGEN_DEFAULT_ALIGN_BYTES); - #elif EIGEN_OS_WIN_STRICT - result = _aligned_malloc(size, EIGEN_DEFAULT_ALIGN_BYTES); #else result = handmade_aligned_malloc(size); #endif @@ -241,48 +170,25 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) /** \internal Frees memory allocated with aligned_malloc. */ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { - #if EIGEN_DEFAULT_ALIGN_BYTES==0 - std::free(ptr); - #elif EIGEN_MALLOC_ALREADY_ALIGNED - std::free(ptr); - #elif EIGEN_HAS_POSIX_MEMALIGN + #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED std::free(ptr); - #elif EIGEN_HAS_MM_MALLOC - _mm_free(ptr); - #elif EIGEN_OS_WIN_STRICT - _aligned_free(ptr); #else handmade_aligned_free(ptr); #endif } /** -* \internal -* \brief Reallocates an aligned block of memory. -* \throws std::bad_alloc on allocation failure -**/ + * \internal + * \brief Reallocates an aligned block of memory. + * \throws std::bad_alloc on allocation failure + */ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) { EIGEN_UNUSED_VARIABLE(old_size); void *result; -#if EIGEN_DEFAULT_ALIGN_BYTES==0 - result = std::realloc(ptr,new_size); -#elif EIGEN_MALLOC_ALREADY_ALIGNED +#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED result = std::realloc(ptr,new_size); -#elif EIGEN_HAS_POSIX_MEMALIGN - result = generic_aligned_realloc(ptr,new_size,old_size); -#elif EIGEN_HAS_MM_MALLOC - // The defined(_mm_free) is just here to verify that this MSVC version - // implements _mm_malloc/_mm_free based on the corresponding _aligned_ - // functions. 
This may not always be the case and we just try to be safe. - #if EIGEN_OS_WIN_STRICT && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,EIGEN_DEFAULT_ALIGN_BYTES); - #else - result = generic_aligned_realloc(ptr,new_size,old_size); - #endif -#elif EIGEN_OS_WIN_STRICT - result = _aligned_realloc(ptr,new_size,EIGEN_DEFAULT_ALIGN_BYTES); #else result = handmade_aligned_realloc(ptr,new_size,old_size); #endif diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index 7cde1a36f..14e84bc20 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox @@ -87,9 +87,6 @@ run time. However, these assertions do cost time and can thus be turned off. - \b EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB. - - \b EIGEN_HAS_POSIX_MEMALIGN - defines whether aligned memory allocation can be performed through the \c posix_memalign - function. The availability of \c posix_memalign is automatically checked on most platform, but this option allows to - by-pass %Eigen's built-in rules. \section TopicPreprocessorDirectivesPlugins Plugins -- cgit v1.2.3 From e8e1d504d6cbeb47c33169a36c052aa21bea32ee Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Feb 2016 21:38:16 +0100 Subject: Add an explicit assertion on the alignment of the pointer returned by std::malloc --- Eigen/src/Core/util/Memory.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 84fb0516c..01513a59e 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -157,6 +157,9 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); + #if EIGEN_DEFAULT_ALIGN_BYTES==16 + eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fall back to the handmade aligned memory allocator."); + #endif #else result = handmade_aligned_malloc(size); #endif -- cgit v1.2.3 From 5b2d287878d4f049c1ba6c55c1fcaac1129d6df0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Feb 2016 21:46:39 +0100 Subject: bug #779: allow non-aligned buffers for buffers smaller than the requested alignment.
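A buffer strictly smaller than the requested alignment cannot in general be expected to be aligned (a Map over a single float has no reason to sit on a 16-byte boundary), so the sanity check in MapBase and the dynalloc tests must not insist on alignment in that case. A sketch of the relaxed condition, mirroring the MapBase::checkSanity() change in the diff below (Alignment > 0 assumed; names follow the patch):

    // Accept the pointer if it is suitably aligned, OR if the mapped data
    // occupies fewer bytes than the alignment requirement itself:
    eigen_assert(( (size_t(m_data) % Alignment) == 0
                 || (cols() * rows() * innerStride() * sizeof(Scalar)) < Alignment )
               && "data is not aligned");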
--- Eigen/src/Core/MapBase.h | 3 ++- test/dynalloc.cpp | 6 +++--- test/mapped_matrix.cpp | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 75a80daaa..afa47540e 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -165,7 +165,8 @@ template class MapBase void checkSanity() const { #if EIGEN_MAX_ALIGN_BYTES>0 - eigen_assert(((size_t(m_data) % EIGEN_PLAIN_ENUM_MAX(1,internal::traits::Alignment)) == 0) && "data is not aligned"); + eigen_assert(( ((size_t(m_data) % EIGEN_PLAIN_ENUM_MAX(1,internal::traits::Alignment)) == 0) + || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); #endif } diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp index 6f22e1ab4..5f587007c 100644 --- a/test/dynalloc.cpp +++ b/test/dynalloc.cpp @@ -31,7 +31,7 @@ void check_handmade_aligned_malloc() void check_aligned_malloc() { - for(int i = 1; i < 1000; i++) + for(int i = ALIGNMENT; i < 1000; i++) { char *p = (char*)internal::aligned_malloc(i); VERIFY(size_t(p)%ALIGNMENT==0); @@ -43,7 +43,7 @@ void check_aligned_malloc() void check_aligned_new() { - for(int i = 1; i < 1000; i++) + for(int i = ALIGNMENT; i < 1000; i++) { float *p = internal::aligned_new(i); VERIFY(size_t(p)%ALIGNMENT==0); @@ -55,7 +55,7 @@ void check_aligned_new() void check_aligned_stack_alloc() { - for(int i = 1; i < 400; i++) + for(int i = ALIGNMENT; i < 400; i++) { ei_declare_aligned_stack_constructed_variable(float,p,i,0); VERIFY(size_t(p)%ALIGNMENT==0); diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp index 7c7099792..88653e887 100644 --- a/test/mapped_matrix.cpp +++ b/test/mapped_matrix.cpp @@ -40,7 +40,7 @@ template void map_class_vector(const VectorType& m) VERIFY_IS_EQUAL(ma1, ma3); VERIFY_IS_EQUAL(ma1, ma4); #ifdef EIGEN_VECTORIZE - if(internal::packet_traits::Vectorizable) + if(internal::packet_traits::Vectorizable && size>=AlignedMax) VERIFY_RAISES_ASSERT((Map(array3unaligned, size))) #endif -- cgit v1.2.3 From fb00a4af2b28ec0b92c072ab059934a673dc2454 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 6 Feb 2016 01:42:14 +0000 Subject: Made the tensor fft test compile on tegra x1 --- unsupported/test/cxx11_tensor_fft.cpp | 46 +++++++++++++++++------------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index 0f6e09106..b2a4b5a96 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp @@ -14,7 +14,7 @@ using Eigen::Tensor; template static void test_fft_2D_golden() { - Tensor input(2, 3); + Tensor input(2, 3); input(0, 0) = 1; input(0, 1) = 2; input(0, 2) = 3; @@ -22,11 +22,11 @@ static void test_fft_2D_golden() { input(1, 1) = 5; input(1, 2) = 6; - array fft; + array fft; fft[0] = 0; fft[1] = 1; - Tensor, 2, DataLayout, long> output = input.template fft(fft); + Tensor, 2, DataLayout> output = input.template fft(fft); std::complex output_golden[6]; // in ColMajor order output_golden[0] = std::complex(21, 0); @@ -57,24 +57,24 @@ static void test_fft_2D_golden() { } static void test_fft_complex_input_golden() { - Tensor, 1, ColMajor, long> input(5); + Tensor, 1, ColMajor> input(5); input(0) = std::complex(1, 1); input(1) = std::complex(2, 2); input(2) = std::complex(3, 3); input(3) = std::complex(4, 4); input(4) = std::complex(5, 5); - array fft; + array fft; fft[0] = 0; - Tensor, 1, ColMajor, long> forward_output_both_parts = input.fft(fft); 
- Tensor, 1, ColMajor, long> reverse_output_both_parts = input.fft(fft); + Tensor, 1, ColMajor> forward_output_both_parts = input.fft(fft); + Tensor, 1, ColMajor> reverse_output_both_parts = input.fft(fft); - Tensor forward_output_real_part = input.fft(fft); - Tensor reverse_output_real_part = input.fft(fft); + Tensor forward_output_real_part = input.fft(fft); + Tensor reverse_output_real_part = input.fft(fft); - Tensor forward_output_imag_part = input.fft(fft); - Tensor reverse_output_imag_part = input.fft(fft); + Tensor forward_output_imag_part = input.fft(fft); + Tensor reverse_output_imag_part = input.fft(fft); VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0)); VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0)); @@ -114,24 +114,24 @@ static void test_fft_complex_input_golden() { } static void test_fft_real_input_golden() { - Tensor input(5); + Tensor input(5); input(0) = 1.0; input(1) = 2.0; input(2) = 3.0; input(3) = 4.0; input(4) = 5.0; - array fft; + array fft; fft[0] = 0; - Tensor, 1, ColMajor, long> forward_output_both_parts = input.fft(fft); - Tensor, 1, ColMajor, long> reverse_output_both_parts = input.fft(fft); + Tensor, 1, ColMajor> forward_output_both_parts = input.fft(fft); + Tensor, 1, ColMajor> reverse_output_both_parts = input.fft(fft); - Tensor forward_output_real_part = input.fft(fft); - Tensor reverse_output_real_part = input.fft(fft); + Tensor forward_output_real_part = input.fft(fft); + Tensor reverse_output_real_part = input.fft(fft); - Tensor forward_output_imag_part = input.fft(fft); - Tensor reverse_output_imag_part = input.fft(fft); + Tensor forward_output_imag_part = input.fft(fft); + Tensor reverse_output_imag_part = input.fft(fft); VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0)); VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0)); @@ -178,17 +178,17 @@ static void test_fft_real_input_golden() { template static void test_fft_real_input_energy() { - Eigen::DSizes dimensions; - int total_size = 1; + Eigen::DSizes dimensions; + ptrdiff_t total_size = 1; for (int i = 0; i < TensorRank; ++i) { dimensions[i] = rand() % 20 + 1; total_size *= dimensions[i]; } - const DSizes arr = dimensions; + const DSizes arr = dimensions; typedef typename internal::conditional, RealScalar>::type InputScalar; - Tensor input; + Tensor input; input.resize(arr); input.setRandom(); -- cgit v1.2.3 From d2cba520152fadcddb481142ba2250b5bbf0ad81 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 5 Feb 2016 18:14:23 -0800 Subject: Only enable the cxx11_tensor_uint128 test on 64-bit machines since 32-bit systems don't support the __uint128_t type --- unsupported/test/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 42e0189a4..c202cf0e4 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -117,7 +117,6 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_complex "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") - ei_add_test(cxx11_tensor_uint128 "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") @@ -149,6 +148,11 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_ifft "-std=c++0x") ei_add_test(cxx11_tensor_empty "-std=c++0x") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + # This test
requires __uint128_t which is only available on 64-bit systems + ei_add_test(cxx11_tensor_uint128 "-std=c++0x") + endif() + endif() # These tests need nvcc -- cgit v1.2.3 From 4d4211c04ebded41109a672002401cc65ef2d3a0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 5 Feb 2016 18:19:41 -0800 Subject: Avoid unnecessary type conversions --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 2 +- unsupported/test/cxx11_tensor_fft.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 3bfaf6d23..9e675ad0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -206,7 +206,7 @@ struct TensorEvaluator, D } for (size_t i = 0; i < m_fft.size(); ++i) { - int dim = m_fft[i]; + Index dim = m_fft[i]; eigen_assert(dim >= 0 && dim < NumDims); Index line_len = m_dimensions[dim]; eigen_assert(line_len >= 1); diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index b2a4b5a96..89874349f 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp @@ -192,7 +192,7 @@ static void test_fft_real_input_energy() { input.resize(arr); input.setRandom(); - array fft; + array fft; for (int i = 0; i < TensorRank; ++i) { fft[i] = i; } -- cgit v1.2.3 From c6a12d1dc61cb38f5c9d48241d30cd99c9b99fd2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 6 Feb 2016 18:06:51 +0100 Subject: Fix warning with gcc < 4.8 --- Eigen/src/Core/MapBase.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index afa47540e..12c464a5a 100755 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -130,7 +130,7 @@ template class MapBase explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - checkSanity(); + checkSanity(); } EIGEN_DEVICE_FUNC @@ -142,7 +142,7 @@ template class MapBase EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) eigen_assert(vecSize >= 0); eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize); - checkSanity(); + checkSanity(); } EIGEN_DEVICE_FUNC @@ -152,7 +152,7 @@ template class MapBase eigen_assert( (dataPtr == 0) || ( rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols))); - checkSanity(); + checkSanity(); } #ifdef EIGEN_MAPBASE_PLUGIN @@ -161,15 +161,21 @@ template class MapBase protected: + template EIGEN_DEVICE_FUNC - void checkSanity() const + void checkSanity(typename internal::enable_if<(internal::traits::Alignment>0),void*>::type = 0) const { #if EIGEN_MAX_ALIGN_BYTES>0 - eigen_assert(( ((size_t(m_data) % EIGEN_PLAIN_ENUM_MAX(1,internal::traits::Alignment)) == 0) + eigen_assert(( ((size_t(m_data) % internal::traits::Alignment) == 0) || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); #endif } + template + EIGEN_DEVICE_FUNC + void checkSanity(typename internal::enable_if::Alignment==0,void*>::type = 0) const + {} + PointerType m_data; const internal::variable_if_dynamic m_rows; const internal::variable_if_dynamic m_cols; -- cgit v1.2.3 From 8e599bc098cef7030004489dca2b9dab920f2cc8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 6 Feb 2016 20:26:59 +0100 Subject: Fix
warning in unit test --- test/zerosized.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/zerosized.cpp b/test/zerosized.cpp index 85c553453..2404fdc2b 100644 --- a/test/zerosized.cpp +++ b/test/zerosized.cpp @@ -49,8 +49,8 @@ template void zeroSizedMatrix() if(MatrixType::MaxColsAtCompileTime!=0 && MatrixType::MaxRowsAtCompileTime!=0) { - Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random(1,10) : MatrixType::RowsAtCompileTime; - Index cols = MatrixType::ColsAtCompileTime==Dynamic ? internal::random(1,10) : MatrixType::ColsAtCompileTime; + Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random(1,10) : Index(MatrixType::RowsAtCompileTime); + Index cols = MatrixType::ColsAtCompileTime==Dynamic ? internal::random(1,10) : Index(MatrixType::ColsAtCompileTime); MatrixType m(rows,cols); zeroReduction(m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols)); zeroReduction(m.template block(0,0,rows,0)); -- cgit v1.2.3 From 010afe1619c9200b885f74d3c3937e000ec76b1d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 6 Feb 2016 22:49:18 +0100 Subject: Add examples for reshaping/slicing with Map. --- doc/Manual.dox | 2 + doc/TutorialReshapeSlicing.dox | 65 ++++++++++++++++++++++++++++++++ doc/snippets/Tutorial_ReshapeMat2Mat.cpp | 6 +++ doc/snippets/Tutorial_ReshapeMat2Vec.cpp | 11 ++++++ doc/snippets/Tutorial_SlicingCol.cpp | 11 ++++++ doc/snippets/Tutorial_SlicingVec.cpp | 4 ++ 6 files changed, 99 insertions(+) create mode 100644 doc/TutorialReshapeSlicing.dox create mode 100644 doc/snippets/Tutorial_ReshapeMat2Mat.cpp create mode 100644 doc/snippets/Tutorial_ReshapeMat2Vec.cpp create mode 100644 doc/snippets/Tutorial_SlicingCol.cpp create mode 100644 doc/snippets/Tutorial_SlicingVec.cpp diff --git a/doc/Manual.dox b/doc/Manual.dox index c10c490a7..70aaa9a42 100644 --- a/doc/Manual.dox +++ b/doc/Manual.dox @@ -59,6 +59,8 @@ namespace Eigen { \ingroup DenseMatrixManipulation_chapter */ /** \addtogroup TutorialMapClass \ingroup DenseMatrixManipulation_chapter */ +/** \addtogroup TutorialReshapeSlicing + \ingroup DenseMatrixManipulation_chapter */ /** \addtogroup TopicAliasing \ingroup DenseMatrixManipulation_chapter */ /** \addtogroup TopicStorageOrders \ingroup DenseMatrixManipulation_chapter */ diff --git a/doc/TutorialReshapeSlicing.dox b/doc/TutorialReshapeSlicing.dox new file mode 100644 index 000000000..eb0fb0df0 --- /dev/null +++ b/doc/TutorialReshapeSlicing.dox @@ -0,0 +1,65 @@ +namespace Eigen { + +/** \eigenManualPage TutorialReshapeSlicing Reshape and Slicing + +%Eigen does not expose convenient methods to take slices or to reshape a matrix yet. +Nonetheless, such features can easily be emulated using the Map class. + +\eigenAutoToc + +\section TutorialReshape Reshape + +A reshape operation consists in modifying the sizes of a matrix while keeping the same coefficients. +Instead of modifying the input matrix itself, which is not possible for compile-time sizes, the approach consists in creating a different \em view on the storage using class Map.
+Here is a typical example creating a 1D linear view of a matrix:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReshapeMat2Vec.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReshapeMat2Vec.out
+</td></tr></table>
+
+Note how the storage order of the input matrix modifies the order of the coefficients in the linear view.
+Here is another example, reshaping a 2x6 matrix to a 6x2 one:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReshapeMat2Mat.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReshapeMat2Mat.out
+</td></tr></table>
+
+
+\section TutorialSlicing Slicing
+
+Slicing consists in taking a set of rows, columns, or elements, uniformly spaced within a matrix.
+Again, the class Map makes it easy to mimic this feature.
+
+For instance, one can take every P-th element in a vector:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_SlicingVec.cpp
+</td>
+<td>
+\verbinclude Tutorial_SlicingVec.out
+</td></tr></table>
+
+One can also take one column out of three, using an appropriate outer stride or inner stride depending on the actual storage order:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_SlicingCol.cpp
+</td>
+<td>
+\verbinclude Tutorial_SlicingCol.out
+</td></tr></table>
+ +*/ + +} diff --git a/doc/snippets/Tutorial_ReshapeMat2Mat.cpp b/doc/snippets/Tutorial_ReshapeMat2Mat.cpp new file mode 100644 index 000000000..f84d6e76d --- /dev/null +++ b/doc/snippets/Tutorial_ReshapeMat2Mat.cpp @@ -0,0 +1,6 @@ +MatrixXf M1(2,6); // Column-major storage +M1 << 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12; + +Map M2(M1.data(), 6,2); +cout << "M2:" << endl << M2 << endl; \ No newline at end of file diff --git a/doc/snippets/Tutorial_ReshapeMat2Vec.cpp b/doc/snippets/Tutorial_ReshapeMat2Vec.cpp new file mode 100644 index 000000000..95bd4e0e6 --- /dev/null +++ b/doc/snippets/Tutorial_ReshapeMat2Vec.cpp @@ -0,0 +1,11 @@ +MatrixXf M1(3,3); // Column-major storage +M1 << 1, 2, 3, + 4, 5, 6, + 7, 8, 9; + +Map v1(M1.data(), M1.size()); +cout << "v1:" << endl << v1 << endl; + +Matrix M2(M1); +Map v2(M2.data(), M2.size()); +cout << "v2:" << endl << v2 << endl; \ No newline at end of file diff --git a/doc/snippets/Tutorial_SlicingCol.cpp b/doc/snippets/Tutorial_SlicingCol.cpp new file mode 100644 index 000000000..f667ff689 --- /dev/null +++ b/doc/snippets/Tutorial_SlicingCol.cpp @@ -0,0 +1,11 @@ +MatrixXf M1 = MatrixXf::Random(3,8); +cout << "Column major input:" << endl << M1 << "\n"; +Map > M2(M1.data(), M1.rows(), (M1.cols()+2)/3, OuterStride<>(M1.outerStride()*3)); +cout << "1 column over 3:" << endl << M2 << "\n"; + +typedef Matrix RowMajorMatrixXf; +RowMajorMatrixXf M3(M1); +cout << "Row major input:" << endl << M3 << "\n"; +Map > M4(M3.data(), M3.rows(), (M3.cols()+2)/3, + Stride(M3.outerStride(),3)); +cout << "1 column over 3:" << endl << M4 << "\n"; \ No newline at end of file diff --git a/doc/snippets/Tutorial_SlicingVec.cpp b/doc/snippets/Tutorial_SlicingVec.cpp new file mode 100644 index 000000000..07e10bf69 --- /dev/null +++ b/doc/snippets/Tutorial_SlicingVec.cpp @@ -0,0 +1,4 @@ +RowVectorXf v = RowVectorXf::LinSpaced(20,0,19); +cout << "Input:" << endl << v << endl; +Map > v2(v.data(), v.size()/2); +cout << "Even:" << v2 << endl; \ No newline at end of file -- cgit v1.2.3 From d904c8ac8f5963b0c6f2528f4ee1823350277713 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sat, 6 Feb 2016 16:32:00 -0800 Subject: Implement complete orthogonal decomposition in Eigen. 
--- Eigen/QR | 1 + Eigen/src/Core/MatrixBase.h | 25 +- Eigen/src/Core/util/ForwardDeclarations.h | 3 +- Eigen/src/QR/ColPivHouseholderQR.h | 2 + Eigen/src/QR/CompleteOrthogonalDecomposition.h | 538 +++++++++++++++++++++++++ test/qr_colpivoting.cpp | 77 ++++ 6 files changed, 633 insertions(+), 13 deletions(-) create mode 100644 Eigen/src/QR/CompleteOrthogonalDecomposition.h diff --git a/Eigen/QR b/Eigen/QR index f74f365f1..25c781cc1 100644 --- a/Eigen/QR +++ b/Eigen/QR @@ -34,6 +34,7 @@ #include "src/QR/HouseholderQR.h" #include "src/QR/FullPivHouseholderQR.h" #include "src/QR/ColPivHouseholderQR.h" +#include "src/QR/CompleteOrthogonalDecomposition.h" #ifdef EIGEN_USE_LAPACKE #include "src/QR/HouseholderQR_MKL.h" #include "src/QR/ColPivHouseholderQR_MKL.h" diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 3770ab257..1e66b4e1b 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -66,7 +66,7 @@ template class MatrixBase using Base::MaxSizeAtCompileTime; using Base::IsVectorAtCompileTime; using Base::Flags; - + using Base::derived; using Base::const_cast_derived; using Base::rows; @@ -175,7 +175,7 @@ template class MatrixBase #endif template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC const Product lazyProduct(const MatrixBase &other) const; @@ -214,7 +214,7 @@ template class MatrixBase typedef Diagonal DiagonalReturnType; EIGEN_DEVICE_FUNC DiagonalReturnType diagonal(); - + typedef typename internal::add_const >::type ConstDiagonalReturnType; EIGEN_DEVICE_FUNC ConstDiagonalReturnType diagonal() const; @@ -222,14 +222,14 @@ template class MatrixBase template struct DiagonalIndexReturnType { typedef Diagonal Type; }; template struct ConstDiagonalIndexReturnType { typedef const Diagonal Type; }; - template + template EIGEN_DEVICE_FUNC typename DiagonalIndexReturnType::Type diagonal(); template EIGEN_DEVICE_FUNC typename ConstDiagonalIndexReturnType::Type diagonal() const; - + typedef Diagonal DiagonalDynamicIndexReturnType; typedef typename internal::add_const >::type ConstDiagonalDynamicIndexReturnType; @@ -251,7 +251,7 @@ template class MatrixBase template struct SelfAdjointViewReturnType { typedef SelfAdjointView Type; }; template struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView Type; }; - template + template EIGEN_DEVICE_FUNC typename SelfAdjointViewReturnType::Type selfadjointView(); template @@ -340,7 +340,7 @@ template class MatrixBase EIGEN_DEVICE_FUNC inline const Inverse inverse() const; - + template inline void computeInverseAndDetWithCheck( ResultType& inverse, @@ -366,6 +366,7 @@ template class MatrixBase inline const HouseholderQR householderQr() const; inline const ColPivHouseholderQR colPivHouseholderQr() const; inline const FullPivHouseholderQR fullPivHouseholderQr() const; + inline const CompleteOrthogonalDecomposition completeOrthogonalDecomposition() const; /////////// Eigenvalues module /////////// @@ -394,23 +395,23 @@ template class MatrixBase inline PlainObject #endif cross(const MatrixBase& other) const; - + template EIGEN_DEVICE_FUNC inline PlainObject cross3(const MatrixBase& other) const; - + EIGEN_DEVICE_FUNC inline PlainObject unitOrthogonal(void) const; - + inline Matrix eulerAngles(Index a0, Index a1, Index a2) const; - + inline ScalarMultipleReturnType operator*(const UniformScaling& s) const; // put this as separate enum value to work around possible GCC 4.3 bug (?) enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? 
((internal::traits::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical) : ColsAtCompileTime==1 ? Vertical : Horizontal }; typedef Homogeneous HomogeneousReturnType; inline HomogeneousReturnType homogeneous() const; + enum { SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1 }; diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 31c7088e7..f09632375 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -95,7 +95,7 @@ template class Solve; template class Inverse; template class Product; - + template class DiagonalBase; template class DiagonalWrapper; template class DiagonalMatrix; @@ -248,6 +248,7 @@ template struct inverse_impl; template class HouseholderQR; template class ColPivHouseholderQR; template class FullPivHouseholderQR; +template class CompleteOrthogonalDecomposition; template class JacobiSVD; template class BDCSVD; template class LLT; diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index efeb1f438..7c559f952 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -404,6 +404,8 @@ template class ColPivHouseholderQR protected: + friend class CompleteOrthogonalDecomposition; + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h new file mode 100644 index 000000000..b81bb7433 --- /dev/null +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -0,0 +1,538 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H +#define EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H + +namespace Eigen { + +namespace internal { +template +struct traits > + : traits<_MatrixType> { + enum { Flags = 0 }; +}; + +} // end namespace internal + +/** \ingroup QR_Module + * + * \class CompleteOrthogonalDecomposition + * + * \brief Complete orthogonal decomposition (COD) of a matrix. + * + * \param MatrixType the type of the matrix of which we are computing the COD. + * + * This class performs a rank-revealing complete orthogonal decomposition of a + * matrix \b A into matrices \b P, \b Q, \b T, and \b Z such that + * \f[ + * \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \begin{bmatrix} \mathbf{T} & + * \mathbf{0} \\ \mathbf{0} & \mathbf{0} \end{bmatrix} \, \mathbf{Z} + * \f] + * by using Householder transformations. Here, \b P is a permutation matrix, + * \b Q and \b Z are unitary matrices and \b T is an upper triangular matrix of + * size rank-by-rank. \b A may be rank deficient.
+ * + * \sa MatrixBase::completeOrthogonalDecomposition() + */ +template +class CompleteOrthogonalDecomposition { + public: + typedef _MatrixType MatrixType; + enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + Options = MatrixType::Options, + MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename MatrixType::StorageIndex StorageIndex; + typedef Matrix + MatrixQType; + typedef typename internal::plain_diag_type::type HCoeffsType; + typedef PermutationMatrix + PermutationType; + typedef typename internal::plain_row_type::type + IntRowVectorType; + typedef typename internal::plain_row_type::type RowVectorType; + typedef typename internal::plain_row_type::type + RealRowVectorType; + typedef HouseholderSequence< + MatrixType, typename internal::remove_all< + typename HCoeffsType::ConjugateReturnType>::type> + HouseholderSequenceType; + + private: + typedef typename PermutationType::Index PermIndexType; + + public: + /** + * \brief Default Constructor. + * + * The default constructor is useful in cases in which the user intends to + * perform decompositions via + * \c CompleteOrthogonalDecomposition::compute(const MatrixType&). + */ + CompleteOrthogonalDecomposition() : m_cpqr(), m_zCoeffs(), m_temp() {} + + /** \brief Default Constructor with memory preallocation + * + * Like the default constructor but with preallocation of the internal data + * according to the specified problem \a size. + * \sa CompleteOrthogonalDecomposition() + */ + CompleteOrthogonalDecomposition(Index rows, Index cols) + : m_cpqr(rows, cols), m_zCoeffs((std::min)(rows, cols)), m_temp(cols) {} + + /** \brief Constructs a complete orthogonal decomposition from a given + * matrix. + * + * This constructor computes the complete orthogonal decomposition of the + * matrix \a matrix by calling the method compute(). The default + * threshold for rank determination will be used. It is a shortcut for: + * + * \code + * CompleteOrthogonalDecomposition cod(matrix.rows(), + * matrix.cols()); + * cod.setThreshold(Default); + * cod.compute(matrix); + * \endcode + * + * \sa compute() + */ + template + explicit CompleteOrthogonalDecomposition(const EigenBase& matrix) + : m_cpqr(matrix.rows(), matrix.cols()), + m_zCoeffs((std::min)(matrix.rows(), matrix.cols())), + m_temp(matrix.cols()) { + compute(matrix.derived()); + } + + /** This method computes the minimum-norm solution X to a least squares + * problem \f[\mathrm{minimize} \|A X - B\| \f], where \b A is the matrix of + * which \c *this is the complete orthogonal decomposition. + * + * \param b the right-hand sides of the problem to solve. + * + * \returns a solution. + * + */ + template + inline const Solve solve( + const MatrixBase& b) const { + eigen_assert(m_cpqr.m_isInitialized && + "CompleteOrthogonalDecomposition is not initialized."); + return Solve(*this, b.derived()); + } + + HouseholderSequenceType householderQ(void) const; + HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); } + + /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. + */ + template + void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const; + + /** \returns the matrix \b Z.
+ */ + MatrixType matrixZ() const { + MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols()); + applyZAdjointOnTheLeftInPlace(Z); + return Z.adjoint(); + } + + /** \returns a reference to the matrix where the complete orthogonal + * decomposition is stored + */ + const MatrixType& matrixQTZ() const { return m_cpqr.matrixQR(); } + + /** \returns a reference to the matrix where the complete orthogonal + * decomposition is stored. + * \warning The strict lower part and \code cols() - rank() \endcode right + * columns of this matrix contain internal values. + * Only the upper triangular part should be referenced. To get it, use + * \code matrixT().template triangularView() \endcode + * For rank-deficient matrices, use + * \code + * matrixT().topLeftCorner(rank(), rank()).template triangularView() + * \endcode + */ + const MatrixType& matrixT() const { return m_cpqr.matrixQR(); } + + template + CompleteOrthogonalDecomposition& compute(const EigenBase& matrix); + + /** \returns a const reference to the column permutation matrix */ + const PermutationType& colsPermutation() const { + return m_cpqr.colsPermutation(); + } + + /** \returns the absolute value of the determinant of the matrix of which + * *this is the complete orthogonal decomposition. It has only linear + * complexity (that is, O(n) where n is the dimension of the square matrix) + * as the complete orthogonal decomposition has already been computed. + * + * \note This is only for square matrices. + * + * \warning a determinant can be very big or small, so for matrices + * of large enough dimension, there is a risk of overflow/underflow. + * One way to work around that is to use logAbsDeterminant() instead. + * + * \sa logAbsDeterminant(), MatrixBase::determinant() + */ + typename MatrixType::RealScalar absDeterminant() const; + + /** \returns the natural log of the absolute value of the determinant of the + * matrix of which *this is the complete orthogonal decomposition. It has + * only linear complexity (that is, O(n) where n is the dimension of the + * square matrix) as the complete orthogonal decomposition has already been + * computed. + * + * \note This is only for square matrices. + * + * \note This method is useful to work around the risk of overflow/underflow + * that's inherent to determinant computation. + * + * \sa absDeterminant(), MatrixBase::determinant() + */ + typename MatrixType::RealScalar logAbsDeterminant() const; + + /** \returns the rank of the matrix of which *this is the complete orthogonal + * decomposition. + * + * \note This method has to determine which pivots should be considered + * nonzero. For that, it uses the threshold value that you can control by + * calling setThreshold(const RealScalar&). + */ + inline Index rank() const { return m_cpqr.rank(); } + + /** \returns the dimension of the kernel of the matrix of which *this is the + * complete orthogonal decomposition. + * + * \note This method has to determine which pivots should be considered + * nonzero. For that, it uses the threshold value that you can control by + * calling setThreshold(const RealScalar&). + */ + inline Index dimensionOfKernel() const { return m_cpqr.dimensionOfKernel(); } + + /** \returns true if the matrix of which *this is the decomposition represents + * an injective linear map, i.e. has trivial kernel; false otherwise. + * + * \note This method has to determine which pivots should be considered + * nonzero.
For that, it uses the threshold value that you can control by + * calling setThreshold(const RealScalar&). + */ + inline bool isInjective() const { return m_cpqr.isInjective(); } + + /** \returns true if the matrix of which *this is the decomposition represents + * a surjective linear map; false otherwise. + * + * \note This method has to determine which pivots should be considered + * nonzero. For that, it uses the threshold value that you can control by + * calling setThreshold(const RealScalar&). + */ + inline bool isSurjective() const { return m_cpqr.isSurjective(); } + + /** \returns true if the matrix of which *this is the complete orthogonal + * decomposition is invertible. + * + * \note This method has to determine which pivots should be considered + * nonzero. For that, it uses the threshold value that you can control by + * calling setThreshold(const RealScalar&). + */ + inline bool isInvertible() const { return m_cpqr.isInvertible(); } + + /** \returns the inverse of the matrix of which *this is the complete + * orthogonal decomposition. + * + * \note If this matrix is not invertible, the returned matrix has undefined + * coefficients. Use isInvertible() to first determine whether this matrix is + * invertible. + */ + + // TODO(rmlarsen): Add method for pseudo-inverse. + // inline const + // internal::solve_retval + // inverse() const + // { + // eigen_assert(m_isInitialized && "CompleteOrthogonalDecomposition is not + // initialized."); + // return internal::solve_retval + // (*this, MatrixType::Identity(m_cpqr.rows(), m_cpqr.cols())); + // } + + inline Index rows() const { return m_cpqr.rows(); } + inline Index cols() const { return m_cpqr.cols(); } + + /** \returns a const reference to the vector of Householder coefficients used + * to represent the factor \c Q. + * + * For advanced uses only. + */ + inline const HCoeffsType& hCoeffs() const { return m_cpqr.hCoeffs(); } + + /** \returns a const reference to the vector of Householder coefficients + * used to represent the factor \c Z. + * + * For advanced uses only. + */ + const HCoeffsType& zCoeffs() const { return m_zCoeffs; } + + /** Allows one to prescribe a threshold to be used by certain methods, such as + * rank(), which need to determine when pivots are to be considered nonzero. + * Must be called before calling compute(). + * + * When it needs to get the threshold value, Eigen calls threshold(). By + * default, this uses a formula to automatically determine a reasonable + * threshold. Once you have called the present method + * setThreshold(const RealScalar&), your value is used instead. + * + * \param threshold The new value to use as the threshold. + * + * A pivot will be considered nonzero if its absolute value is strictly + * greater than + * \f$ threshold \times \vert maxpivot \vert \f$ + * where maxpivot is the biggest pivot. + * + * If you want to come back to the default behavior, call + * setThreshold(Default_t) + */ + CompleteOrthogonalDecomposition& setThreshold(const RealScalar& threshold) { + m_cpqr.setThreshold(threshold); + return *this; + } + + /** Allows one to come back to the default behavior, letting Eigen use its default + * formula for determining the threshold. + * + * You should pass the special object Eigen::Default as parameter here. + * \code qr.setThreshold(Eigen::Default); \endcode + * + * See the documentation of setThreshold(const RealScalar&).
+ */ + CompleteOrthogonalDecomposition& setThreshold(Default_t) { + m_cpqr.setThreshold(Default); + return *this; + } + + /** Returns the threshold that will be used by certain methods such as rank(). + * + * See the documentation of setThreshold(const RealScalar&). + */ + RealScalar threshold() const { return m_cpqr.threshold(); } + + /** \returns the number of nonzero pivots in the complete orthogonal + * decomposition. + * Here nonzero is meant in the exact sense, not in a fuzzy sense. + * So that notion isn't really intrinsically interesting, but it is + * still useful when implementing algorithms. + * + * \sa rank() + */ + inline Index nonzeroPivots() const { return m_cpqr.nonzeroPivots(); } + + /** \returns the absolute value of the biggest pivot, i.e. the biggest + * diagonal coefficient of R. + */ + inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); } + + /** \brief Reports whether the complete orthogonal decomposition was + * successful. + * + * \note This function always returns \c Success. It is provided for + * compatibility with other factorization routines. + * \returns \c Success + */ + ComputationInfo info() const { + eigen_assert(m_cpqr.m_isInitialized && "Decomposition is not initialized."); + return Success; + } + +#ifndef EIGEN_PARSED_BY_DOXYGEN + template + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; +#endif + + protected: + static void check_template_parameters() { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + + ColPivHouseholderQR m_cpqr; + HCoeffsType m_zCoeffs; + RowVectorType m_temp; +}; + +template +typename MatrixType::RealScalar +CompleteOrthogonalDecomposition::absDeterminant() const { + return m_cpqr.absDeterminant(); +} + +template +typename MatrixType::RealScalar +CompleteOrthogonalDecomposition::logAbsDeterminant() const { + return m_cpqr.logAbsDeterminant(); +} + +/** Performs the complete orthogonal decomposition of the given matrix \a + * matrix. The result of the factorization is stored into \c *this, and a + * reference to \c *this is returned. + * + * \sa class CompleteOrthogonalDecomposition, + * CompleteOrthogonalDecomposition(const MatrixType&) + */ +template +template +CompleteOrthogonalDecomposition& CompleteOrthogonalDecomposition< + MatrixType>::compute(const EigenBase& matrix) { + check_template_parameters(); + + // the column permutation is stored as int indices, so just to be sure: + eigen_assert(matrix.cols() <= NumTraits::highest()); + + // Compute the column pivoted QR factorization A P = Q R. + m_cpqr.compute(matrix); + + const Index rank = m_cpqr.rank(); + const Index cols = matrix.cols(); + if (rank < cols) { + // We have reduced the (permuted) matrix to the form + // [R11 R12] + // [ 0 R22] + // where R11 is r-by-r (r = rank) upper triangular, R12 is + // r-by-(n-r), and R22 is empty or the norm of R22 is negligible. + // We now compute the complete orthogonal decomposition by applying + // Householder transformations from the right to the upper trapezoidal + // matrix X = [R11 R12] to zero out R12 and obtain the factorization + // [R11 R12] = [T11 0] * Z, where T11 is r-by-r upper triangular and + // Z = Z(0) * Z(1) ... Z(r-1) is an n-by-n orthogonal matrix. + // We store the data representing Z in R12 and m_zCoeffs.
+ for (Index k = rank - 1; k >= 0; --k) { + if (k != rank - 1) { + // Given the API for Householder reflectors, it is more convenient if + // we swap the leading parts of columns k and r-1 (zero-based) to form + // the matrix X_k = [X(0:k, k), X(0:k, r:n)] + m_cpqr.m_qr.col(k).head(k + 1).swap( + m_cpqr.m_qr.col(rank - 1).head(k + 1)); + } + // Construct Householder reflector Z(k) to zero out the last row of X_k, + // i.e. choose Z(k) such that + // [X(k, k), X(k, r:n)] * Z(k) = [beta, 0, .., 0]. + RealScalar beta; + m_cpqr.m_qr.row(k) + .tail(cols - rank + 1) + .makeHouseholderInPlace(m_zCoeffs(k), beta); + m_cpqr.m_qr(k, rank - 1) = beta; + if (k > 0) { + // Apply Z(k) to the first k rows of X_k + m_cpqr.m_qr.topRightCorner(k, cols - rank + 1) + .applyHouseholderOnTheRight( + m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k), + &m_temp(0)); + } + if (k != rank - 1) { + // Swap X(0:k,k) back to its proper location. + m_cpqr.m_qr.col(k).head(k + 1).swap( + m_cpqr.m_qr.col(rank - 1).head(k + 1)); + } + } + } + return *this; +} + +template +template +void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( + Rhs& rhs) const { + const Index cols = this->cols(); + const Index nrhs = rhs.cols(); + const Index rank = this->rank(); + Matrix temp((std::max)(cols, nrhs)); + for (Index k = 0; k < rank; ++k) { + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + rhs.middleRows(rank - 1, cols - rank + 1) + .applyHouseholderOnTheLeft( + matrixQTZ().row(k).tail(cols - rank).adjoint(), zCoeffs()(k), + &temp(0)); + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + } +} + +#ifndef EIGEN_PARSED_BY_DOXYGEN +template +template +void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( + const RhsType& rhs, DstType& dst) const { + eigen_assert(rhs().rows() == this->rows()); + + const Index rank = this->rank(); + if (rank == 0) { + dst.setZero(); + return; + } + + // Compute c = Q^* * rhs + // Note that the matrix Q = H_0^* H_1^*... so its inverse is + // Q^* = (H_0 H_1 ...)^T + typename RhsType::PlainObject c(rhs()); + c.applyOnTheLeft( + householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose()); + + // Solve T z = c(1:rank, :) + dst.topRows(rank) = matrixT() + .topLeftCorner(rank, rank) + .template triangularView() + .solve(c.topRows(rank)); + + const Index cols = this->cols(); + if (rank < cols) { + // Compute y = Z^* * [ z ] + // [ 0 ] + dst.bottomRows(cols - rank).setZero(); + applyZAdjointOnTheLeftInPlace(dst); + } + + // Undo permutation to get x = P^{-1} * y. + dst = colsPermutation() * dst; +} +#endif + +/** \returns the matrix Q as a sequence of householder transformations */ +template +typename CompleteOrthogonalDecomposition::HouseholderSequenceType +CompleteOrthogonalDecomposition::householderQ() const { + return m_cpqr.householderQ(); +} + +#ifndef __CUDACC__ +/** \return the complete orthogonal decomposition of \c *this. 
+ * + * \sa class CompleteOrthogonalDecomposition + */ +template +const CompleteOrthogonalDecomposition::PlainObject> +MatrixBase::completeOrthogonalDecomposition() const { + return CompleteOrthogonalDecomposition(eval()); +} +#endif // __CUDACC__ + +} // end namespace Eigen + +#endif // EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index 9c989823e..648250af6 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -10,6 +10,83 @@ #include "main.h" #include +#include + +template +void cod() { + typedef typename MatrixType::Index Index; + + Index rows = internal::random(2, EIGEN_TEST_MAX_SIZE); + Index cols = internal::random(2, EIGEN_TEST_MAX_SIZE); + Index cols2 = internal::random(2, EIGEN_TEST_MAX_SIZE); + Index rank = internal::random(1, (std::min)(rows, cols) - 1); + + typedef typename MatrixType::Scalar Scalar; + typedef Matrix + MatrixQType; + MatrixType matrix; + createRandomPIMatrixOfRank(rank, rows, cols, matrix); + CompleteOrthogonalDecomposition cod(matrix); + VERIFY(rank == cod.rank()); + VERIFY(cols - cod.rank() == cod.dimensionOfKernel()); + VERIFY(!cod.isInjective()); + VERIFY(!cod.isInvertible()); + VERIFY(!cod.isSurjective()); + + MatrixQType q = cod.householderQ(); + VERIFY_IS_UNITARY(q); + + MatrixType z = cod.matrixZ(); + VERIFY_IS_UNITARY(z); + + MatrixType t; + t.setZero(rows, cols); + t.topLeftCorner(rank, rank) = + cod.matrixT().topLeftCorner(rank, rank).template triangularView(); + + MatrixType c = q * t * z * cod.colsPermutation().inverse(); + VERIFY_IS_APPROX(matrix, c); + + MatrixType exact_solution = MatrixType::Random(cols, cols2); + MatrixType rhs = matrix * exact_solution; + MatrixType cod_solution = cod.solve(rhs); + VERIFY_IS_APPROX(rhs, matrix * cod_solution); + + // Verify that we get the same minimum-norm solution as the SVD. + JacobiSVD svd(matrix, ComputeThinU | ComputeThinV); + MatrixType svd_solution = svd.solve(rhs); + VERIFY_IS_APPROX(cod_solution, svd_solution); +} + +template +void cod_fixedsize() { + enum { + Rows = MatrixType::RowsAtCompileTime, + Cols = MatrixType::ColsAtCompileTime + }; + typedef typename MatrixType::Scalar Scalar; + int rank = internal::random(1, (std::min)(int(Rows), int(Cols)) - 1); + Matrix matrix; + createRandomPIMatrixOfRank(rank, Rows, Cols, matrix); + CompleteOrthogonalDecomposition > cod(matrix); + VERIFY(rank == cod.rank()); + VERIFY(Cols - cod.rank() == cod.dimensionOfKernel()); + VERIFY(cod.isInjective() == (rank == Rows)); + VERIFY(cod.isSurjective() == (rank == Cols)); + VERIFY(cod.isInvertible() == (cod.isInjective() && cod.isSurjective())); + + Matrix exact_solution; + exact_solution.setRandom(Cols, Cols2); + Matrix rhs = matrix * exact_solution; + Matrix cod_solution = cod.solve(rhs); + VERIFY_IS_APPROX(rhs, matrix * cod_solution); + + // Verify that we get the same minimum-norm solution as the SVD. + JacobiSVD svd(matrix, ComputeFullU | ComputeFullV); + Matrix svd_solution = svd.solve(rhs); + VERIFY_IS_APPROX(cod_solution, svd_solution); +} template void qr() { -- cgit v1.2.3 From 019fff9a009fc77a64d61a9ff201b20c8de3a991 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sat, 6 Feb 2016 17:48:42 -0800 Subject: Add my name to copyright notice in ColPivHouseholder.h, mostly for previous work on stable norm downdate formula. 
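For context, the "stable norm downdate formula" refers to how column-pivoted QR maintains the squared norms of the trailing columns: after eliminating column k they can be updated rather than recomputed from scratch. In the standard textbook form (a sketch for background, not a quote of the Eigen code):

    \[ \|a_j^{(k+1)}\|_2^2 \;=\; \|a_j^{(k)}\|_2^2 - |r_{kj}|^2, \qquad j > k. \]

The subtraction cancels badly when |r_{kj}| is close to the current column norm, so robust implementations monitor the downdated value and recompute the affected column norm directly once it becomes unreliable.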
--- Eigen/src/QR/ColPivHouseholderQR.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 7c559f952..cedbecb61 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -3,6 +3,7 @@ // // Copyright (C) 2008-2009 Gael Guennebaud // Copyright (C) 2009 Benoit Jacob +// Copyright (C) 2016 Rasmus Munk Larsen // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed -- cgit v1.2.3 From 16ec450ca18dac57c2fdd55d134b626056d30e3e Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sat, 6 Feb 2016 17:54:01 -0800 Subject: Nevermind. --- Eigen/src/QR/ColPivHouseholderQR.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index cedbecb61..7c559f952 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -3,7 +3,6 @@ // // Copyright (C) 2008-2009 Gael Guennebaud // Copyright (C) 2009 Benoit Jacob -// Copyright (C) 2016 Rasmus Munk Larsen // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed -- cgit v1.2.3 From a4c76f8d34b79e5964f8c0cd4d560d52d0ff32c2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Feb 2016 11:33:02 +0100 Subject: Improve inlining --- Eigen/src/Core/AssignEvaluator.h | 3 ++- Eigen/src/Core/ProductEvaluators.h | 6 ++++-- Eigen/src/Core/TriangularMatrix.h | 6 ++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 5b65bfb0c..a9a524130 100755 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -806,7 +806,8 @@ struct Assignment template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment { - EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); src.evalTo(dst); diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index b2a0a4b4f..3ce86e8cd 100755 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -97,7 +97,8 @@ struct product_evaluator, ProductTag, LhsShape, RhsSh Flags = Base::Flags | EvalBeforeNestingBit }; - EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { ::new (static_cast(this)) Base(m_result); @@ -412,7 +413,8 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename XprType::PacketScalar PacketScalar; typedef typename XprType::PacketReturnType PacketReturnType; - EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) : m_lhs(xpr.lhs()), m_rhs(xpr.rhs()), m_lhsImpl(m_lhs), // FIXME the creation of the evaluator objects should result in a no-op, but check that! 
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index f55b42eed..e6d137e40 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -776,7 +776,8 @@ public: }; template -EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -800,7 +801,8 @@ EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, co } template -EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src) { call_triangular_assignment_loop(dst, src, internal::assign_op()); } -- cgit v1.2.3 From c2bf2f56efb0f83c3e8aeb796825f6bfda53977b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Feb 2016 14:29:12 +0100 Subject: Remove custom unaligned loads for SSE. They were only useful for core2 CPU. --- Eigen/src/Core/arch/SSE/PacketMath.h | 37 +++--------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index eb517b871..c2071da8f 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -314,58 +314,27 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { E return _mm_loadu_ps(from); #endif } - template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } #else // NOTE: with the code below, MSVC's compiler crashes! 
-#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8))) - // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 -#elif EIGEN_COMP_CLANG - // bug 201: Segfaults in __mm_loadh_pd with clang 2.8 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 -#else - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0 -#endif - template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS return _mm_loadu_ps(from); -#else - __m128d res; - res = _mm_load_sd((const double*)(from)) ; - res = _mm_loadh_pd(res, (const double*)(from+2)) ; - return _mm_castpd_ps(res); -#endif } +#endif + template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS return _mm_loadu_pd(from); -#else - __m128d res; - res = _mm_load_sd(from) ; - res = _mm_loadh_pd(res,from+1); - return res; -#endif } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS return _mm_loadu_si128(reinterpret_cast(from)); -#else - __m128d res; - res = _mm_load_sd((const double*)(from)) ; - res = _mm_loadh_pd(res, (const double*)(from+2)) ; - return _mm_castpd_si128(res); -#endif } -#endif + template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { -- cgit v1.2.3 From 414efa47d3e37a15ef681f86425778fbf3a09c27 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 8 Feb 2016 08:50:34 -0800 Subject: Add missing calls to tests of COD. Fix a few mistakes in 3.2 -> 3.3 port. --- Eigen/src/QR/CompleteOrthogonalDecomposition.h | 4 ++-- test/qr_colpivoting.cpp | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index b81bb7433..4095e79e5 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -480,7 +480,7 @@ template template void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( const RhsType& rhs, DstType& dst) const { - eigen_assert(rhs().rows() == this->rows()); + eigen_assert(rhs.rows() == this->rows()); const Index rank = this->rank(); if (rank == 0) { @@ -491,7 +491,7 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( // Compute c = Q^* * rhs // Note that the matrix Q = H_0^* H_1^*... so its inverse is // Q^* = (H_0 H_1 ...)^T - typename RhsType::PlainObject c(rhs()); + typename RhsType::PlainObject c(rhs); c.applyOnTheLeft( householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose()); diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index 648250af6..16f80d8b5 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -289,6 +289,15 @@ void test_qr_colpivoting() CALL_SUBTEST_5(( qr_fixedsize, 1 >() )); } + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( cod() ); + CALL_SUBTEST_2( cod() ); + CALL_SUBTEST_3( cod() ); + CALL_SUBTEST_4(( cod_fixedsize, 4 >() )); + CALL_SUBTEST_5(( cod_fixedsize, 3 >() )); + CALL_SUBTEST_5(( cod_fixedsize, 1 >() )); + } + for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( qr_invertible() ); CALL_SUBTEST_2( qr_invertible() ); -- cgit v1.2.3 From 53f60e0afca01d9e07fd1c44d163369ae36009ca Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 8 Feb 2016 09:01:43 -0800 Subject: Make applyZAdjointOnTheLeftInPlace protected. 
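applyZAdjointOnTheLeftInPlace() applies Z^* through the Householder vectors stored inside the factorization and mutates its argument in place, so it is an implementation detail of compute() and solve() rather than part of the public interface; Z itself remains reachable through matrixZ(). For reference, the in-place pattern it relies on, applying a single Householder reflector H = I - tau*v*v^T from the left without forming H, is roughly (a generic sketch with real scalars, assuming v and tau are given; this is not the Eigen kernel itself):

    // rhs <- (I - tau * v * v^T) * rhs
    RowVectorXd w = v.transpose() * rhs;   // w = v^T * rhs
    rhs.noalias() -= tau * v * w;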
--- Eigen/src/QR/CompleteOrthogonalDecomposition.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 4095e79e5..bee5bf47e 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -137,11 +137,6 @@ class CompleteOrthogonalDecomposition { HouseholderSequenceType householderQ(void) const; HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); } - /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. - */ - template - void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const; - /** \returns the matrix \b Z. */ MatrixType matrixZ() const { @@ -333,10 +328,9 @@ class CompleteOrthogonalDecomposition { RealScalar threshold() const { return m_cpqr.threshold(); } /** \returns the number of nonzero pivots in the complete orthogonal - * decomposition. - * Here nonzero is meant in the exact sense, not in a fuzzy sense. - * So that notion isn't really intrinsically interesting, but it is - * still useful when implementing algorithms. + * decomposition. Here nonzero is meant in the exact sense, not in a + * fuzzy sense. So that notion isn't really intrinsically interesting, + * but it is still useful when implementing algorithms. * * \sa rank() */ @@ -370,6 +364,11 @@ class CompleteOrthogonalDecomposition { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. + */ + template + void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const; + ColPivHouseholderQR m_cpqr; HCoeffsType m_zCoeffs; RowVectorType m_temp; -- cgit v1.2.3 From 24d291cf164591c16cc6a6b60ec38551144cc43c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 9 Feb 2016 02:34:02 +0000 Subject: Worked around nvcc crash when compiling Eigen on Tegra X1 --- Eigen/src/Core/util/Macros.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index cf6b03ec7..1993cb8ec 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -354,8 +354,12 @@ // Does the compiler support variadic templates? #if __cplusplus > 199711L +// Disable the use of variadic templates when compiling with nvcc on ARM devices: +// this prevents nvcc from crashing when compiling Eigen on Tegra X1 +#if !defined(__NVCC__) || !defined(EIGEN_ARCH_ARM_OR_ARM64) #define EIGEN_HAS_VARIADIC_TEMPLATES 1 #endif +#endif // Does the compiler support const expressions? #ifdef __CUDACC__ -- cgit v1.2.3 From d69946183d021c42c2670c61c3131d208f1221f4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 8 Feb 2016 21:03:59 -0800 Subject: Updated the TensorIntDivisor code to work properly on LLP64 systems --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index b58173e58..ae0de9420 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -45,8 +45,9 @@ namespace { } return leading_zeros; #else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); return (sizeof(T) == 8) ? 
-        __builtin_clzl(static_cast<uint64_t>(val)) :
+        __builtin_clzll(static_cast<uint64_t>(val)) :
         __builtin_clz(static_cast<uint32_t>(val));
 #endif
   }
--
cgit v1.2.3

From a9cc6a06b9fe2ea50b0437f73b101116a39da727 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 9 Feb 2016 05:10:06 +0000
Subject: Fixed compilation warning in the splines test

---
 unsupported/test/splines.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp
index 97665af96..3be020434 100644
--- a/unsupported/test/splines.cpp
+++ b/unsupported/test/splines.cpp
@@ -239,7 +239,7 @@ void check_global_interpolation_with_derivatives2d()
   typedef Spline2d::PointType PointType;
   typedef Spline2d::KnotVectorType KnotVectorType;

-  const unsigned int numPoints = 100;
+  const Eigen::DenseIndex numPoints = 100;
   const unsigned int dimension = 2;
   const unsigned int degree = 3;
--
cgit v1.2.3

From 5cc0dd5f44248db2df03274dc0ddf7b8bb60804a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 9 Feb 2016 10:32:01 -0800
Subject: Fixed the code that disables the use of variadic templates when
 compiling with nvcc on ARM devices.

---
 Eigen/src/Core/util/Macros.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 1993cb8ec..c11251fcb 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -356,7 +356,7 @@
 #if __cplusplus > 199711L
 // Disable the use of variadic templates when compiling with nvcc on ARM devices:
 // this prevents nvcc from crashing when compiling Eigen on Tegra X1
-#if !defined(__NVCC__) || !defined(EIGEN_ARCH_ARM_OR_ARM64)
+#if !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
 #endif
 #endif
--
cgit v1.2.3

From bb8811c6555cd62cff333bce3927b3b647a8c5ea Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Tue, 9 Feb 2016 20:35:20 -0800
Subject: Enable inverse() method for computing pseudo-inverse.

---
 Eigen/src/QR/CompleteOrthogonalDecomposition.h | 41 +++++++++++++++-----------
 test/qr_colpivoting.cpp | 3 ++
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index bee5bf47e..9bc768b7c 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -70,6 +70,7 @@ class CompleteOrthogonalDecomposition {
       MatrixType, typename internal::remove_all<
                       typename HCoeffsType::ConjugateReturnType>::type>
       HouseholderSequenceType;
+  typedef typename MatrixType::PlainObject PlainObject;

 private:
   typedef typename PermutationType::Index PermIndexType;
@@ -246,26 +247,15 @@ class CompleteOrthogonalDecomposition {
    */
   inline bool isInvertible() const { return m_cpqr.isInvertible(); }

-  /** \returns the inverse of the matrix of which *this is the complete
+  /** \returns the pseudo-inverse of the matrix of which *this is the complete
    * orthogonal decomposition.
-   *
-   * \note If this matrix is not invertible, the returned matrix has undefined
-   * coefficients. Use isInvertible() to first determine whether this matrix is
-   * invertible.
+   * \warning: Do not compute \c this->inverse()*rhs to solve a linear system.
+   * It is more efficient and numerically stable to call \c this->solve(rhs).
    */
-
-  // TODO(rmlarsen): Add method for pseudo-inverse.
- // inline const - // internal::solve_retval - // inverse() const - // { - // eigen_assert(m_isInitialized && "CompleteOrthogonalDecomposition is not - // initialized."); - // return internal::solve_retval - // (*this, MatrixType::Identity(m_cpqr.rows(), m_cpqr.cols())); - // } + inline const Inverse inverse() const + { + return Inverse(*this); + } inline Index rows() const { return m_cpqr.rows(); } inline Index cols() const { return m_cpqr.cols(); } @@ -513,6 +503,21 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( } #endif +namespace internal { + +template +struct Assignment >, internal::assign_op, Dense2Dense, Scalar> +{ + typedef CompleteOrthogonalDecomposition CodType; + typedef Inverse SrcXprType; + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) + { + dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows())); + } +}; + +} // end namespace internal + /** \returns the matrix Q as a sequence of householder transformations */ template typename CompleteOrthogonalDecomposition::HouseholderSequenceType diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index 16f80d8b5..d8672ce33 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -57,6 +57,9 @@ void cod() { JacobiSVD svd(matrix, ComputeThinU | ComputeThinV); MatrixType svd_solution = svd.solve(rhs); VERIFY_IS_APPROX(cod_solution, svd_solution); + + MatrixType pinv = cod.inverse(); + VERIFY_IS_APPROX(cod_solution, pinv * rhs); } template -- cgit v1.2.3 From 6323851ea9bcadd0512350d69bdfb45680dc754c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 9 Feb 2016 20:43:41 -0800 Subject: Fixed compilation warning --- unsupported/test/splines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp index 97665af96..3be020434 100644 --- a/unsupported/test/splines.cpp +++ b/unsupported/test/splines.cpp @@ -239,7 +239,7 @@ void check_global_interpolation_with_derivatives2d() typedef Spline2d::PointType PointType; typedef Spline2d::KnotVectorType KnotVectorType; - const unsigned int numPoints = 100; + const Eigen::DenseIndex numPoints = 100; const unsigned int dimension = 2; const unsigned int degree = 3; -- cgit v1.2.3 From 970751ece3fe101acbb64ebb4aa469a9b1d635e4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 9 Feb 2016 20:55:50 -0800 Subject: Disabling the nvcc warnings in addition to the clang warnings when clang is used as a frontend for nvcc --- Eigen/src/Core/util/DisableStupidWarnings.h | 3 ++- Eigen/src/Core/util/ReenableStupidWarnings.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 829b23ac8..cb27acff7 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -41,8 +41,9 @@ #pragma clang diagnostic push #endif #pragma clang diagnostic ignored "-Wconstant-logical-operand" +#endif -#elif defined __NVCC__ +#if defined __NVCC__ // Disable the "statement is unreachable" message #pragma diag_suppress code_is_unreachable // Disable the "dynamic initialization in unreachable code" message diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h index ea88e226c..a23fab198 100644 --- a/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -8,7 +8,9 @@ #pragma warning pop #elif defined __clang__ 
 #pragma clang diagnostic pop
-  #elif defined __NVCC__
+  #endif
+
+  #if defined __NVCC__
 //    Don't reenable the diagnostic messages, as it turns out these messages need
 //    to be disabled at the point of the template instantiation (i.e. the user code)
 //    otherwise they'll be triggered by nvcc.
--
cgit v1.2.3

From e88535634d1ed2af38a01e712d4d0157d8b8f30e Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 9 Feb 2016 23:32:41 -0800
Subject: Fixed some clang compilation warnings

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 52569a359..977dcafb0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -285,17 +285,17 @@ struct DSizes : array<DenseIndex, NumDims> {
   }
   EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }

-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-  template <typename... IndexTypes> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) {
-    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    (*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}};
-  }
-#else
   EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
     eigen_assert(NumDims == 1);
     (*this)[0] = i0;
   }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+  template <typename... IndexTypes> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  }
+#else
   EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
     eigen_assert(NumDims == 2);
     (*this)[0] = i0;
--
cgit v1.2.3

From 72ab7879f77260b6fa29d29a05ab476412529222 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 10 Feb 2016 06:48:28 -0800
Subject: Fixed clang compilation warnings

---
 unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index dc6ca4909..17e485f0a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -342,7 +342,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
     template<typename... IndexTypes>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
-        : m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}})
+        : m_storage(firstDimension, otherDimensions...)
     {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index ed933b6ac..0e89033c4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -85,6 +85,13 @@ class TensorStorage, Options_> : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) { + m_data = internal::conditional_aligned_new_auto(internal::array_prod(m_dimensions)); + } +#endif + EIGEN_DEVICE_FUNC TensorStorage(const Self& other) : m_data(internal::conditional_aligned_new_auto(internal::array_prod(other.m_dimensions))) , m_dimensions(other.m_dimensions) -- cgit v1.2.3 From 964a95bf5e19df3aeff63700f95a02c35b8a2592 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 10 Feb 2016 10:37:22 -0500 Subject: Work around Emscripten bug - https://github.com/kripken/emscripten/issues/4088 --- Eigen/src/Core/arch/SSE/MathFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 74f6abc37..67cc3b3ba 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -531,7 +531,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double &x) { -#if EIGEN_COMP_GNUC +#if EIGEN_COMP_GNUC && !defined(EMSCRIPTEN) return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); -- cgit v1.2.3 From 9a21b38cccf62642e982f89756694f4f6728c1c9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 10 Feb 2016 08:02:04 -0800 Subject: Worked around a few clang compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 44 ++++++++++++++------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 6f69da34a..4a199cdd8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -135,26 +135,27 @@ template class TensorMap : public Tensor return m_data[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const { - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; } } #else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_data[index]; - } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const { @@ -221,27 +222,28 @@ template class TensorMap : public Tensor return m_data[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - const std::size_t NumDims = sizeof...(otherIndices) + 1; + static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 2; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; } } #else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_data[index]; - } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) { -- cgit v1.2.3 From e6ee18d6b46229b0028c454db5f389001649bd45 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 10 Feb 2016 11:11:49 -0500 Subject: Make the GCC workaround for sqrt GCC-only; detect Emscripten as non-GCC --- Eigen/src/Core/arch/SSE/MathFunctions.h | 4 +++- Eigen/src/Core/util/Macros.h | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 67cc3b3ba..5236f5b9a 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -531,7 +531,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double &x) { -#if EIGEN_COMP_GNUC && !defined(EMSCRIPTEN) +#if EIGEN_COMP_GNUC_STRICT + // This works around a GCC bug generating poor code for _mm_sqrt_pd + // See 
https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index c11251fcb..fc456d407 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -99,9 +99,16 @@ #define EIGEN_COMP_ARM 0 #endif +/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler +#if defined(EMSCRIPTEN) + #define EIGEN_COMP_EMSCRIPTEN 1 +#else + #define EIGEN_COMP_EMSCRIPTEN 0 +#endif + /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.) -#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM ) +#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN) #define EIGEN_COMP_GNUC_STRICT 1 #else #define EIGEN_COMP_GNUC_STRICT 0 -- cgit v1.2.3 From 2d523332b397af4dc01d648605ff677cef2ed1f0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 10 Feb 2016 08:48:05 -0800 Subject: Optimized implementation of the hyperbolic tangent function for AVX --- Eigen/src/Core/arch/AVX/MathFunctions.h | 53 +++++++++++++++++++++++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 1 + 2 files changed, 54 insertions(+) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index b0e0222a4..3ea38ae07 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -265,6 +265,59 @@ pexp(const Packet8f& _x) { return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); } +// Hyperbolic Tangent function. +// Doesn't do anything fancy, just a 13/6-degree rational interpolant which +// is accurate up to a couple of ulp in the range [-8, 8], outside of which the +// fl(tanh(x)) = +/-1. +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +ptanh(const Packet8f& _x) { + // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later. + const Packet8f x = _mm256_mul_ps(_x, _mm256_set1_ps(0.125f)); + + // The monomial coefficients of the numerator polynomial (odd). + _EIGEN_DECLARE_CONST_Packet8f(alpha_1, -2.47030171958948e-03); + _EIGEN_DECLARE_CONST_Packet8f(alpha_3, -2.06804010015822e-02); + _EIGEN_DECLARE_CONST_Packet8f(alpha_5, -3.13693994587418e-02); + _EIGEN_DECLARE_CONST_Packet8f(alpha_7, -7.19851201683627e-03); + _EIGEN_DECLARE_CONST_Packet8f(alpha_9, 8.31561269687160e-04); + _EIGEN_DECLARE_CONST_Packet8f(alpha_11, -1.37626659546502e-04); + _EIGEN_DECLARE_CONST_Packet8f(alpha_13, 1.39116714700458e-05); + + // The monomial coefficients of the denominator polynomial (even). + _EIGEN_DECLARE_CONST_Packet8f(beta_0, -3.08787724141615e-04); + _EIGEN_DECLARE_CONST_Packet8f(beta_2, -9.17251911622436e-03); + _EIGEN_DECLARE_CONST_Packet8f(beta_4, -3.09625062090444e-02); + _EIGEN_DECLARE_CONST_Packet8f(beta_6, -2.05669680763032e-02); + + // Since the polynomials are odd/even, we need x^2. + const Packet8f x2 = _mm256_mul_ps(x, x); + + // Evaluate the numerator polynomial p. 
+ Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11); + p = pmadd(x2, p, p8f_alpha_9); + p = pmadd(x2, p, p8f_alpha_7); + p = pmadd(x2, p, p8f_alpha_5); + p = pmadd(x2, p, p8f_alpha_3); + p = pmadd(x2, p, p8f_alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial p. + Packet8f q = pmadd(x2, p8f_beta_6, p8f_beta_4); + q = pmadd(x2, q, p8f_beta_2); + q = pmadd(x2, q, p8f_beta_0); + + // Divide the numerator by the denominator. + const Packet8f res = pdiv(p, q); + + // Mask-out values outside of [-8, 8]. + _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(minus_one, -1.0f); + return _mm256_blendv_ps( + _mm256_blendv_ps(res, p8f_one, _mm256_cmp_ps(x, p8f_one, _CMP_GT_OQ)), + p8f_minus_one, _mm256_cmp_ps(x, p8f_minus_one, _CMP_LT_OQ)); +} + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d pexp(const Packet4d& _x) { diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 717ae67c5..4fec14f44 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -66,6 +66,7 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, HasBlend = 1, HasRound = 1, HasFloor = 1, -- cgit v1.2.3 From bfb3fcd94f55ae6e6e771409c248df2ffe8bff6b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 10 Feb 2016 08:52:30 -0800 Subject: Optimized implementation of the tanh function for SSE --- Eigen/src/Core/arch/SSE/MathFunctions.h | 48 +++++++++++++++++++++++++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 1 + 2 files changed, 49 insertions(+) diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 5236f5b9a..31035c3bf 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -516,6 +516,54 @@ Packet2d prsqrt(const Packet2d& x) { return _mm_div_pd(pset1(1.0), _mm_sqrt_pd(x)); } +// Hyperbolic Tangent function. +// Doesn't do anything fancy, just a 13/6-degree rational interpolant which +// is accurate up to a couple of ulp in the range [-8, 8], outside of which the +// fl(tanh(x)) = +/-1. +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& _x) { + // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later. + const Packet4f x = + pmax(pset1(-1.0f), + pmin(pset1(1.0f), pmul(_x, pset1(0.125f)))); + + // The monomial coefficients of the numerator polynomial (odd). + _EIGEN_DECLARE_CONST_Packet4f(alpha_1, -2.47030171958948e-03); + _EIGEN_DECLARE_CONST_Packet4f(alpha_3, -2.06804010015822e-02); + _EIGEN_DECLARE_CONST_Packet4f(alpha_5, -3.13693994587418e-02); + _EIGEN_DECLARE_CONST_Packet4f(alpha_7, -7.19851201683627e-03); + _EIGEN_DECLARE_CONST_Packet4f(alpha_9, 8.31561269687160e-04); + _EIGEN_DECLARE_CONST_Packet4f(alpha_11, -1.37626659546502e-04); + _EIGEN_DECLARE_CONST_Packet4f(alpha_13, 1.39116714700458e-05); + + // The monomial coefficients of the denominator polynomial (even). + _EIGEN_DECLARE_CONST_Packet4f(beta_0, -3.08787724141615e-04); + _EIGEN_DECLARE_CONST_Packet4f(beta_2, -9.17251911622436e-03); + _EIGEN_DECLARE_CONST_Packet4f(beta_4, -3.09625062090444e-02); + _EIGEN_DECLARE_CONST_Packet4f(beta_6, -2.05669680763032e-02); + + // Since the polynomials are odd/even, we need x^2. + const Packet4f x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. 
+  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
+  p = pmadd(x2, p, p4f_alpha_9);
+  p = pmadd(x2, p, p4f_alpha_7);
+  p = pmadd(x2, p, p4f_alpha_5);
+  p = pmadd(x2, p, p4f_alpha_3);
+  p = pmadd(x2, p, p4f_alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q.
+  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
+  q = pmadd(x2, q, p4f_beta_2);
+  q = pmadd(x2, q, p4f_beta_0);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
 } // end namespace internal

 namespace numext {

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index c2071da8f..451034560 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -109,6 +109,7 @@ template<> struct packet_traits<float> : default_packet_traits
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
     HasBlend = 1

 #ifdef EIGEN_VECTORIZE_SSE4_1
--
cgit v1.2.3

From 9d6f1ad398fe8bf8779619ac236665c31829a08e Mon Sep 17 00:00:00 2001
From: Benoit Jacob
Date: Wed, 10 Feb 2016 12:48:34 -0500
Subject: I'm told to use __EMSCRIPTEN__ by an Emscripten dev.

---
 Eigen/src/Core/util/Macros.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index fc456d407..d196123c6 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -100,7 +100,7 @@
 #endif

 /// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
-#if defined(EMSCRIPTEN)
+#if defined(__EMSCRIPTEN__)
   #define EIGEN_COMP_EMSCRIPTEN 1
 #else
   #define EIGEN_COMP_EMSCRIPTEN 0
--
cgit v1.2.3

From b6fdf7468c7030a540e042106cf9df9b44dccf43 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Wed, 10 Feb 2016 13:03:07 -0800
Subject: Rename inverse -> pseudoInverse.

---
 Eigen/src/QR/CompleteOrthogonalDecomposition.h | 4 ++--
 test/qr_colpivoting.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 9bc768b7c..e71944fd7 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -249,10 +249,10 @@ class CompleteOrthogonalDecomposition {

   /** \returns the pseudo-inverse of the matrix of which *this is the complete
    * orthogonal decomposition.
-   * \warning: Do not compute \c this->inverse()*rhs to solve a linear system.
+   * \warning: Do not compute \c this->pseudoInverse()*rhs to solve a linear system.
    * It is more efficient and numerically stable to call \c this->solve(rhs).
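   * A short illustrative sketch (not from the patch; names are placeholders):
   * \code
   * MatrixXd A = MatrixXd::Random(5, 3);
   * VectorXd b = VectorXd::Random(5);
   * CompleteOrthogonalDecomposition<MatrixXd> cod(A);
   * VectorXd x = cod.solve(b);            // preferred: stable least-squares solve
   * MatrixXd pinv = cod.pseudoInverse();  // only when the pseudo-inverse itself is needed
   * \endcode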
*/ - inline const Inverse inverse() const + inline const Inverse pseudoInverse() const { return Inverse(*this); } diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index d8672ce33..46c54b74f 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -58,7 +58,7 @@ void cod() { MatrixType svd_solution = svd.solve(rhs); VERIFY_IS_APPROX(cod_solution, svd_solution); - MatrixType pinv = cod.inverse(); + MatrixType pinv = cod.pseudoInverse(); VERIFY_IS_APPROX(cod_solution, pinv * rhs); } -- cgit v1.2.3 From 1dfaafe28a34c8cc6f0a9dbbc374ca470eae8a01 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 10 Feb 2016 17:41:47 -0800 Subject: Added a regression test for tanh --- test/packetmath.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index e09a361bf..9e89f85c1 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -325,6 +325,12 @@ template void packetmath_real() data2[i] = internal::random(-87,88); } CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp); + for (int i=0; i(-1,1) * std::pow(Scalar(10), internal::random(-6,6)); + data2[i] = internal::random(-1,1) * std::pow(Scalar(10), internal::random(-6,6)); + } + CHECK_CWISE1_IF(PacketTraits::HasTanh, std::tanh, internal::ptanh); if(PacketTraits::HasExp && PacketTraits::size>=2) { data1[0] = std::numeric_limits::quiet_NaN(); -- cgit v1.2.3 From 6d8b1dce06e5d3d670f510e88071afaf4458b458 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 10 Feb 2016 18:07:11 -0800 Subject: Avoid implicit cast from double to float. --- Eigen/src/Core/arch/AVX/MathFunctions.h | 22 +++++++++++----------- Eigen/src/Core/arch/SSE/MathFunctions.h | 22 +++++++++++----------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 3ea38ae07..a24bf6e26 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -276,19 +276,19 @@ ptanh(const Packet8f& _x) { const Packet8f x = _mm256_mul_ps(_x, _mm256_set1_ps(0.125f)); // The monomial coefficients of the numerator polynomial (odd). - _EIGEN_DECLARE_CONST_Packet8f(alpha_1, -2.47030171958948e-03); - _EIGEN_DECLARE_CONST_Packet8f(alpha_3, -2.06804010015822e-02); - _EIGEN_DECLARE_CONST_Packet8f(alpha_5, -3.13693994587418e-02); - _EIGEN_DECLARE_CONST_Packet8f(alpha_7, -7.19851201683627e-03); - _EIGEN_DECLARE_CONST_Packet8f(alpha_9, 8.31561269687160e-04); - _EIGEN_DECLARE_CONST_Packet8f(alpha_11, -1.37626659546502e-04); - _EIGEN_DECLARE_CONST_Packet8f(alpha_13, 1.39116714700458e-05); + _EIGEN_DECLARE_CONST_Packet8f(alpha_1, -2.47030171958948e-03f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_3, -2.06804010015822e-02f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_5, -3.13693994587418e-02f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_7, -7.19851201683627e-03f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_9, 8.31561269687160e-04f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_11, -1.37626659546502e-04f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_13, 1.39116714700458e-05f); // The monomial coefficients of the denominator polynomial (even). 
- _EIGEN_DECLARE_CONST_Packet8f(beta_0, -3.08787724141615e-04); - _EIGEN_DECLARE_CONST_Packet8f(beta_2, -9.17251911622436e-03); - _EIGEN_DECLARE_CONST_Packet8f(beta_4, -3.09625062090444e-02); - _EIGEN_DECLARE_CONST_Packet8f(beta_6, -2.05669680763032e-02); + _EIGEN_DECLARE_CONST_Packet8f(beta_0, -3.08787724141615e-04f); + _EIGEN_DECLARE_CONST_Packet8f(beta_2, -9.17251911622436e-03f); + _EIGEN_DECLARE_CONST_Packet8f(beta_4, -3.09625062090444e-02f); + _EIGEN_DECLARE_CONST_Packet8f(beta_6, -2.05669680763032e-02f); // Since the polynomials are odd/even, we need x^2. const Packet8f x2 = _mm256_mul_ps(x, x); diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 31035c3bf..a7a0d906f 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -529,19 +529,19 @@ ptanh(const Packet4f& _x) { pmin(pset1(1.0f), pmul(_x, pset1(0.125f)))); // The monomial coefficients of the numerator polynomial (odd). - _EIGEN_DECLARE_CONST_Packet4f(alpha_1, -2.47030171958948e-03); - _EIGEN_DECLARE_CONST_Packet4f(alpha_3, -2.06804010015822e-02); - _EIGEN_DECLARE_CONST_Packet4f(alpha_5, -3.13693994587418e-02); - _EIGEN_DECLARE_CONST_Packet4f(alpha_7, -7.19851201683627e-03); - _EIGEN_DECLARE_CONST_Packet4f(alpha_9, 8.31561269687160e-04); - _EIGEN_DECLARE_CONST_Packet4f(alpha_11, -1.37626659546502e-04); - _EIGEN_DECLARE_CONST_Packet4f(alpha_13, 1.39116714700458e-05); + _EIGEN_DECLARE_CONST_Packet4f(alpha_1, -2.47030171958948e-03f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_3, -2.06804010015822e-02f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_5, -3.13693994587418e-02f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_7, -7.19851201683627e-03f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_9, 8.31561269687160e-04f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_11, -1.37626659546502e-04f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_13, 1.39116714700458e-05f); // The monomial coefficients of the denominator polynomial (even). - _EIGEN_DECLARE_CONST_Packet4f(beta_0, -3.08787724141615e-04); - _EIGEN_DECLARE_CONST_Packet4f(beta_2, -9.17251911622436e-03); - _EIGEN_DECLARE_CONST_Packet4f(beta_4, -3.09625062090444e-02); - _EIGEN_DECLARE_CONST_Packet4f(beta_6, -2.05669680763032e-02); + _EIGEN_DECLARE_CONST_Packet4f(beta_0, -3.08787724141615e-04f); + _EIGEN_DECLARE_CONST_Packet4f(beta_2, -9.17251911622436e-03f); + _EIGEN_DECLARE_CONST_Packet4f(beta_4, -3.09625062090444e-02f); + _EIGEN_DECLARE_CONST_Packet4f(beta_6, -2.05669680763032e-02f); // Since the polynomials are odd/even, we need x^2. const Packet4f x2 = pmul(x, x); -- cgit v1.2.3 From 8cc9232b9a58016eebf6c3c4a51143cdec89144e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 11 Feb 2016 15:32:56 +0100 Subject: bug #774: fix a numerical issue producing unwanted reflections. --- Eigen/src/Geometry/Umeyama.h | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 8d9b7a154..6943f719e 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -135,22 +135,11 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo // Eq. 
(39) VectorType S = VectorType::Ones(m); - if (sigma.determinant() Scalar(0) ) { - Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose(); - } else { - const Scalar s = S(m-1); S(m-1) = Scalar(-1); - Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); - S(m-1) = s; - } - } else { - Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); - } + + if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) + S(m-1) = -1; + + Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); if (with_scaling) { -- cgit v1.2.3 From c569cfe12ae6b6bf246e915f0b03ca983c9f225c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 11 Feb 2016 09:33:32 -0800 Subject: Inline the +=, -=, *= and /= operators consistently between DenseBase.h and SelfCwiseBinaryOp.h --- Eigen/src/Core/SelfCwiseBinaryOp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h index 38185d9d7..78fff1549 100644 --- a/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -13,7 +13,7 @@ namespace Eigen { template -inline Derived& DenseBase::operator*=(const Scalar& other) +EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op()); @@ -21,7 +21,7 @@ inline Derived& DenseBase::operator*=(const Scalar& other) } template -inline Derived& ArrayBase::operator+=(const Scalar& other) +EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op()); @@ -29,7 +29,7 @@ inline Derived& ArrayBase::operator+=(const Scalar& other) } template -inline Derived& ArrayBase::operator-=(const Scalar& other) +EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op()); @@ -37,7 +37,7 @@ inline Derived& ArrayBase::operator-=(const Scalar& other) } template -inline Derived& DenseBase::operator/=(const Scalar& other) +EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op()); -- cgit v1.2.3 From eeac46f98012ba4a69060f8d3bc365e04f1edaa7 Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Thu, 11 Feb 2016 19:38:37 +0100 Subject: bug #774: re-added comment referencing equations in the original paper --- Eigen/src/Geometry/Umeyama.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 6943f719e..7e933fca1 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -139,6 +139,7 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) S(m-1) = -1; + // Eq. 
(40) and (43) Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); if (with_scaling) -- cgit v1.2.3 From 3628f7655d5063c4a7e67c6efc9e4ba10c31892c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 11 Feb 2016 15:05:03 -0800 Subject: Made it possible to run the scalar_binary_pow_op functor on GPU --- Eigen/src/Core/MathFunctions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index e87b60f8f..447f1b834 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -496,7 +496,7 @@ template struct pow_default_impl { typedef Scalar retval; - static inline Scalar run(const Scalar& x, const Scalar& y) + static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) { EIGEN_USING_STD_MATH(pow); return pow(x, y); @@ -506,7 +506,7 @@ struct pow_default_impl template struct pow_default_impl { - static inline Scalar run(Scalar x, Scalar y) + static EIGEN_DEVICE_FUNC inline Scalar run(Scalar x, Scalar y) { Scalar res(1); eigen_assert(!NumTraits::IsSigned || y >= 0); -- cgit v1.2.3 From de345eff2e7e41505224e04c47e2a91b020b5a5a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 11 Feb 2016 16:34:07 -0800 Subject: Added a method to conjugate the content of a tensor or the result of a tensor expression. --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 ++++++ unsupported/test/cxx11_tensor_of_complex.cpp | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index cca716d6f..4dea1d3a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -170,6 +170,12 @@ class TensorBase return unaryExpr(internal::scalar_abs_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + conjugate() const { + return unaryExpr(internal::scalar_conjugate_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> pow(Scalar exponent) const { diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp index 8ad04f699..25e51143e 100644 --- a/unsupported/test/cxx11_tensor_of_complex.cpp +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -48,6 +48,25 @@ static void test_abs() } +static void test_conjugate() +{ + Tensor, 1> data1(3); + Tensor, 1> data2(3); + Tensor data3(3); + data1.setRandom(); + data2.setRandom(); + data3.setRandom(); + + Tensor, 1> conj1 = data1.conjugate(); + Tensor, 1> conj2 = data2.conjugate(); + Tensor conj3 = data3.conjugate(); + for (int i = 0; i < 3; ++i) { + VERIFY_IS_APPROX(conj1(i), std::conj(data1(i))); + VERIFY_IS_APPROX(conj2(i), std::conj(data2(i))); + VERIFY_IS_APPROX(conj3(i), data3(i)); + } +} + static void test_contractions() { Tensor, 4> t_left(30, 50, 8, 31); @@ -77,5 +96,6 @@ void test_cxx11_tensor_of_complex() { CALL_SUBTEST(test_additions()); CALL_SUBTEST(test_abs()); + CALL_SUBTEST(test_conjugate()); CALL_SUBTEST(test_contractions()); } -- cgit v1.2.3 From 9e3f3a2d272d6efa6845cd560da1a5546f93ff61 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 11 Feb 2016 17:27:35 -0800 Subject: Deleted outdated comment --- unsupported/test/cxx11_tensor_cuda.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 60f9314a5..58da21d3b 100644 --- 
a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -7,8 +7,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// TODO(mdevin): Free the cuda memory. - #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_cuda -- cgit v1.2.3 From b35d1a122ec2702cb5e6a262b6d34b3098f998b3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 12 Feb 2016 15:31:16 +0100 Subject: Fix unit test: accessing elements in a deque by offsetting a pointer to another element causes undefined behavior. --- test/stddeque_overload.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/stddeque_overload.cpp b/test/stddeque_overload.cpp index d887e35ba..4da618bbf 100644 --- a/test/stddeque_overload.cpp +++ b/test/stddeque_overload.cpp @@ -48,7 +48,6 @@ void check_stddeque_matrix(const MatrixType& m) VERIFY_IS_APPROX(v[21], y); v.push_back(x); VERIFY_IS_APPROX(v[22], x); - VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType)); // do a lot of push_back such that the deque gets internally resized // (with memory reallocation) -- cgit v1.2.3 From 0a537cb2d87ada8206ec2271fb9f2904a18ccfce Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 12 Feb 2016 15:58:31 +0100 Subject: bug #901: fix triangular-view with unit diagonal of sparse rectangular matrices. --- Eigen/src/SparseCore/SparseTriangularView.h | 21 ++++++++++++--------- test/sparse_basic.cpp | 5 ++--- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h index 7c718e4e1..2c6aedaf9 100644 --- a/Eigen/src/SparseCore/SparseTriangularView.h +++ b/Eigen/src/SparseCore/SparseTriangularView.h @@ -70,20 +70,20 @@ class TriangularViewImpl::InnerIterator : public MatrixT public: EIGEN_STRONG_INLINE InnerIterator(const TriangularViewImpl& view, Index outer) - : Base(view.derived().nestedExpression(), outer), m_returnOne(false) + : Base(view.derived().nestedExpression(), outer), m_returnOne(false), m_containsDiag(Base::outer()index()<=outer : this->index()=Base::outer())) { if((!SkipFirst) && Base::operator bool()) Base::operator++(); - m_returnOne = true; + m_returnOne = m_containsDiag; } } @@ -98,7 +98,7 @@ class TriangularViewImpl::InnerIterator : public MatrixT { if((!SkipFirst) && Base::operator bool()) Base::operator++(); - m_returnOne = true; + m_returnOne = m_containsDiag; } } return *this; @@ -130,6 +130,7 @@ class TriangularViewImpl::InnerIterator : public MatrixT } protected: bool m_returnOne; + bool m_containsDiag; }; template @@ -193,7 +194,7 @@ public: Flags = XprType::Flags }; - explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()) {} + explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()), m_arg(xpr.nestedExpression()) {} inline Index nonZerosEstimate() const { return m_argImpl.nonZerosEstimate(); @@ -205,20 +206,20 @@ public: public: EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& xprEval, Index outer) - : Base(xprEval.m_argImpl,outer), m_returnOne(false) + : Base(xprEval.m_argImpl,outer), m_returnOne(false), m_containsDiag(Base::outer()index()<=outer : this->index()=Base::outer())) { if((!SkipFirst) && Base::operator bool()) Base::operator++(); - m_returnOne = true; // FIXME check innerSize()>outer(); + m_returnOne = m_containsDiag; } } @@ -233,7 +234,7 @@ public: { if((!SkipFirst) && Base::operator bool()) 
Base::operator++(); - m_returnOne = true; // FIXME check innerSize()>outer(); + m_returnOne = m_containsDiag; } } return *this; @@ -266,12 +267,14 @@ public: protected: bool m_returnOne; + bool m_containsDiag; private: Scalar& valueRef(); }; protected: evaluator m_argImpl; + const ArgType& m_arg; }; } // end namespace internal diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 0a06c828b..cb8ebaedf 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -21,8 +21,8 @@ template void sparse_basic(const SparseMatrixType& re const Index rows = ref.rows(); const Index cols = ref.cols(); - const Index inner = ref.innerSize(); - const Index outer = ref.outerSize(); + //const Index inner = ref.innerSize(); + //const Index outer = ref.outerSize(); typedef typename SparseMatrixType::Scalar Scalar; enum { Flags = SparseMatrixType::Flags }; @@ -327,7 +327,6 @@ template void sparse_basic(const SparseMatrixType& re m3 = m2.template triangularView(); VERIFY_IS_APPROX(m3, refMat3); - if(inner>=outer) // FIXME this should be implemented for outer>inner as well { refMat3 = refMat2.template triangularView(); m3 = m2.template triangularView(); -- cgit v1.2.3 From 2f5f56a8207d61c890ae47c05ad7e1ec2ac94dbb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 12 Feb 2016 16:13:16 +0100 Subject: Fix usage of evaluator in sparse * permutation products. --- Eigen/src/SparseCore/SparseSelfAdjointView.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 402733cce..b92bb17e2 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -387,7 +387,10 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix Dest; typedef Matrix VectorI; + typedef evaluator MatEval; + typedef typename evaluator::InnerIterator MatIterator; + MatEval matEval(mat); Dest& dest(_dest.derived()); enum { StorageOrderMatch = int(Dest::IsRowMajor) == int(MatrixType::IsRowMajor) @@ -401,7 +404,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix(it.index()); Index r = it.row(); @@ -474,12 +477,17 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix& dest(_dest.derived()); typedef Matrix VectorI; + typedef evaluator MatEval; + typedef typename evaluator::InnerIterator MatIterator; + enum { SrcOrder = MatrixType::IsRowMajor ? RowMajor : ColMajor, StorageOrderMatch = int(SrcOrder) == int(DstOrder), DstMode = DstOrder==RowMajor ? (_DstMode==Upper ? Lower : Upper) : _DstMode, SrcMode = SrcOrder==RowMajor ? (_SrcMode==Upper ? Lower : Upper) : _SrcMode }; + + MatEval matEval(mat); Index size = mat.rows(); VectorI count(size); @@ -488,7 +496,7 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrixj)) @@ -508,7 +516,7 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrixj)) -- cgit v1.2.3 From 4252af6897a2eb0f0bd725ef77f6cb2a979104ca Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 12 Feb 2016 16:13:35 +0100 Subject: Remove dead code. 
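For context, a hedged sketch of the use case served by the preceding bug #901
fix (mirroring the updated sparse_basic test; sizes and values are illustrative):

    #include <Eigen/SparseCore>
    typedef Eigen::SparseMatrix<double> SpMat;
    SpMat m(5, 3);                     // rectangular: inner size != outer size
    m.insert(1, 0) = 2.0;              // strictly lower entry, kept by the view
    m.insert(0, 2) = 7.0;              // strictly upper entry, dropped by the view
    m.makeCompressed();
    SpMat t = m.triangularView<Eigen::UnitLower>();  // implicit ones on the diagonal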
--- Eigen/src/SparseCore/SparseTriangularView.h | 106 ---------------------------- 1 file changed, 106 deletions(-) diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h index 2c6aedaf9..0c27855d5 100644 --- a/Eigen/src/SparseCore/SparseTriangularView.h +++ b/Eigen/src/SparseCore/SparseTriangularView.h @@ -43,9 +43,6 @@ template class TriangularViewImpl::type MatrixTypeNestedNonRef; typedef typename internal::remove_all::type MatrixTypeNestedCleaned; @@ -63,109 +60,6 @@ template class TriangularViewImpl -class TriangularViewImpl::InnerIterator : public MatrixTypeNestedCleaned::InnerIterator -{ - typedef typename MatrixTypeNestedCleaned::InnerIterator Base; - public: - - EIGEN_STRONG_INLINE InnerIterator(const TriangularViewImpl& view, Index outer) - : Base(view.derived().nestedExpression(), outer), m_returnOne(false), m_containsDiag(Base::outer()index()<=outer : this->index()=Base::outer())) - { - if((!SkipFirst) && Base::operator bool()) - Base::operator++(); - m_returnOne = m_containsDiag; - } - } - - EIGEN_STRONG_INLINE InnerIterator& operator++() - { - if(HasUnitDiag && m_returnOne) - m_returnOne = false; - else - { - Base::operator++(); - if(HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index()>=Base::outer())) - { - if((!SkipFirst) && Base::operator bool()) - Base::operator++(); - m_returnOne = m_containsDiag; - } - } - return *this; - } - - inline Index row() const { return (MatrixType::Flags&RowMajorBit ? Base::outer() : this->index()); } - inline Index col() const { return (MatrixType::Flags&RowMajorBit ? this->index() : Base::outer()); } - inline StorageIndex index() const - { - if(HasUnitDiag && m_returnOne) return Base::outer(); - else return Base::index(); - } - inline Scalar value() const - { - if(HasUnitDiag && m_returnOne) return Scalar(1); - else return Base::value(); - } - - EIGEN_STRONG_INLINE operator bool() const - { - if(HasUnitDiag && m_returnOne) - return true; - if(SkipFirst) return Base::operator bool(); - else - { - if (SkipDiag) return (Base::operator bool() && this->index() < this->outer()); - else return (Base::operator bool() && this->index() <= this->outer()); - } - } - protected: - bool m_returnOne; - bool m_containsDiag; -}; - -template -class TriangularViewImpl::ReverseInnerIterator : public MatrixTypeNestedCleaned::ReverseInnerIterator -{ - typedef typename MatrixTypeNestedCleaned::ReverseInnerIterator Base; - public: - - EIGEN_STRONG_INLINE ReverseInnerIterator(const TriangularViewType& view, Index outer) - : Base(view.derived().nestedExpression(), outer) - { - eigen_assert((!HasUnitDiag) && "ReverseInnerIterator does not support yet triangular views with a unit diagonal"); - if(SkipLast) { - while((*this) && (SkipDiag ? this->index()>=outer : this->index()>outer)) - --(*this); - } - } - - EIGEN_STRONG_INLINE ReverseInnerIterator& operator--() - { Base::operator--(); return *this; } - - inline Index row() const { return Base::row(); } - inline Index col() const { return Base::col(); } - - EIGEN_STRONG_INLINE operator bool() const - { - if (SkipLast) return Base::operator bool() ; - else - { - if(SkipDiag) return (Base::operator bool() && this->index() > this->outer()); - else return (Base::operator bool() && this->index() >= this->outer()); - } - } -}; - namespace internal { template -- cgit v1.2.3 From 6eff3e51852b5d15e5c21997f3bdf4ba3122696b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 12 Feb 2016 17:09:28 +0100 Subject: Fix triangularView versus triangularPart. 
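The construct this documentation fix discusses, sketched (function and
template-parameter names are illustrative, not taken from the docs):

    template <typename Derived1, typename Derived2>
    void copyLowerPart(Eigen::MatrixBase<Derived1>& dst,
                       const Eigen::MatrixBase<Derived2>& src)
    {
      // 'template' tells the compiler that triangularView is a member template,
      // so the '<' that follows opens a template argument list rather than
      // being parsed as operator<.
      dst.template triangularView<Eigen::Lower>() = src;
    }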
--- doc/TemplateKeyword.dox | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/TemplateKeyword.dox b/doc/TemplateKeyword.dox index e06aba7ba..b84cfdae9 100644 --- a/doc/TemplateKeyword.dox +++ b/doc/TemplateKeyword.dox @@ -73,13 +73,13 @@ for operator<". The reason that the \c template keyword is necessary in the last example has to do with the rules for how templates are supposed to be compiled in C++. The compiler has to check the code for correct syntax at the point where the template is defined, without knowing the actual value of the template arguments (\c Derived1 -and \c Derived2 in the example). That means that the compiler cannot know that dst.triangularPart is +and \c Derived2 in the example). That means that the compiler cannot know that dst.triangularView is a member template and that the following < symbol is part of the delimiter for the template -parameter. Another possibility would be that dst.triangularPart is a member variable with the < +parameter. Another possibility would be that dst.triangularView is a member variable with the < symbol refering to the operator<() function. In fact, the compiler should choose the second -possibility, according to the standard. If dst.triangularPart is a member template (as in our case), +possibility, according to the standard. If dst.triangularView is a member template (as in our case), the programmer should specify this explicitly with the \c template keyword and write dst.template -triangularPart. +triangularView. The precise rules are rather complicated, but ignoring some subtleties we can summarize them as follows: - A dependent name is name that depends (directly or indirectly) on a template parameter. In the -- cgit v1.2.3 From c8b4c4b48a41a1744c9ad7a888e2bcad23250904 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 12 Feb 2016 22:09:16 +0100 Subject: bug #795: mention allocate_shared as a condidate for aligned_allocator. --- doc/UnalignedArrayAssert.dox | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox index 8c97d7874..65ab16fb7 100644 --- a/doc/UnalignedArrayAssert.dox +++ b/doc/UnalignedArrayAssert.dox @@ -7,8 +7,8 @@ Hello! You are seeing this webpage because your program terminated on an asserti my_program: path/to/eigen/Eigen/src/Core/DenseStorage.h:44: Eigen::internal::matrix_array::internal::matrix_array() [with T = double, int Size = 2, int MatrixOptions = 2, bool Align = true]: -Assertion `(reinterpret_cast(array) & 0xf) == 0 && "this assertion -is explained here: http://eigen.tuxfamily.org/dox/UnalignedArrayAssert.html +Assertion `(reinterpret_cast(array) & (sizemask)) == 0 && "this assertion +is explained here: http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html **** READ THIS WEB PAGE !!! ****"' failed. @@ -46,9 +46,9 @@ then you need to read this separate page: \ref TopicStructHavingEigenMembers "St Note that here, Eigen::Vector2d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types". 
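As an illustrative sketch of the fix that page describes (the class name is
made up for the example), a structure with such a member gets an aligned
operator new:

\code
struct Foo
{
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW  // makes "new Foo" return suitably aligned memory
  Eigen::Vector2d v;               // fixed-size vectorizable member
};
Foo *foo = new Foo;
\endcode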
-\section c2 Cause 2: STL Containers
+\section c2 Cause 2: STL Containers or manual memory allocation

-If you use STL Containers such as std::vector, std::map, ..., with Eigen objects, or with classes containing Eigen objects, like this,
+If you use STL Containers such as std::vector, std::map, ..., with %Eigen objects, or with classes containing %Eigen objects, like this,

 \code
 std::vector<Eigen::Matrix2f> my_vector;
@@ -60,6 +60,8 @@ then you need to read this separate page: \ref TopicStlContainers "Using STL Con

 Note that here, Eigen::Matrix2f is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member".

+The same issue will be exhibited by any classes/functions bypassing operator new to allocate memory, that is, by performing custom memory allocation followed by calls to the placement new operator. This is for instance typically the case of \c std::make_shared or \c std::allocate_shared, for which the solution is to use an \ref aligned_allocator "aligned allocator" as detailed in the \ref TopicStlContainers "solution for STL containers".
+
 \section c3 Cause 3: Passing Eigen objects by value

 If some function in your code is getting an Eigen object passed by value, like this,
--
cgit v1.2.3

From 8e1f1ba6a6cf0580da6f8756562f94b6410d5e58 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 12 Feb 2016 22:16:59 +0100
Subject: Import wiki's paragraph: "I disabled vectorization, but I'm still
 getting annoyed about alignment issues"

---
 doc/UnalignedArrayAssert.dox | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox
index 65ab16fb7..f0f84d25f 100644
--- a/doc/UnalignedArrayAssert.dox
+++ b/doc/UnalignedArrayAssert.dox
@@ -109,7 +109,10 @@ Two possibilities:
 128-bit alignment code and thus preserves ABI compatibility, but completely disables vectorization.

-For more information, see this FAQ.
+If you want to know why defining EIGEN_DONT_VECTORIZE does not by itself disable 128-bit alignment and the assertion, here's the explanation:
+
+It doesn't disable the assertion, because otherwise code that runs fine without vectorization would suddenly crash when enabling vectorization.
+It doesn't disable 128-bit alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.

 */
--
cgit v1.2.3

From f6f057bb7d3fcd24b751cba2e70d416f4a803e1f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Mon, 15 Feb 2016 21:43:07 +0100
Subject: bug #1166: fix shortcoming in gemv when the destination is not a
 vector at compile-time.
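A hedged sketch of the failing pattern this addresses (mirroring the new
checks added to test/product.h below; names are illustrative):

    Eigen::MatrixXd m1(4, 4), square(4, 4), res(4, 4);
    m1.setRandom(); square.setRandom(); res.setRandom();
    // The destination is a Block, i.e. not a vector at compile time, yet the
    // expression is a gemv-style row-vector * matrix product:
    res.block(0, 0, 1, 4).noalias() = m1.col(0).transpose() * square;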
--- Eigen/src/Core/GeneralProduct.h | 11 +++++++---- test/product.h | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 0769a212e..53f934999 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -213,15 +213,18 @@ template<> struct gemv_dense_selector ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) * RhsBlasTraits::extractScalarFactor(rhs); + // make sure Dest is a compile-time vector type (bug 1166) + typedef typename conditional::type ActualDest; + enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 // on, the other hand it is good for the cache to pack the vector anyways... - EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1, + EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1), ComplexByReal = (NumTraits::IsComplex) && (!NumTraits::IsComplex), - MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal + MightCannotUseDest = (ActualDest::InnerStrideAtCompileTime!=1) || ComplexByReal }; - gemv_static_vector_if static_dest; + gemv_static_vector_if static_dest; const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; @@ -314,7 +317,7 @@ template<> struct gemv_dense_selector actualLhs.rows(), actualLhs.cols(), LhsMapper(actualLhs.data(), actualLhs.outerStride()), RhsMapper(actualRhsPtr, 1), - dest.data(), dest.innerStride(), + dest.data(), dest.col(0).innerStride(), //NOTE if dest is not a vector at compile-time, then dest.innerStride() might be wrong. (bug 1166) actualAlpha); } }; diff --git a/test/product.h b/test/product.h index bd92309d2..45bb64958 100644 --- a/test/product.h +++ b/test/product.h @@ -144,6 +144,22 @@ template void product(const MatrixType& m) VERIFY_IS_APPROX(res.col(r).noalias() = square.adjoint() * square.col(r), (square.adjoint() * square.col(r)).eval()); VERIFY_IS_APPROX(res.col(r).noalias() = square * square.col(r), (square * square.col(r)).eval()); + // vector at runtime (see bug 1166) + { + RowSquareMatrixType ref(square); + ColSquareMatrixType ref2(square2); + ref = res = square; + VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.col(0).transpose() * square.transpose(), (ref.row(0) = m1.col(0).transpose() * square.transpose())); + VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.block(0,0,rows,1).transpose() * square.transpose(), (ref.row(0) = m1.col(0).transpose() * square.transpose())); + VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.col(0).transpose() * square, (ref.row(0) = m1.col(0).transpose() * square)); + VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.block(0,0,rows,1).transpose() * square, (ref.row(0) = m1.col(0).transpose() * square)); + ref2 = res2 = square2; + VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.row(0) * square2.transpose(), (ref2.row(0) = m1.row(0) * square2.transpose())); + VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.block(0,0,1,cols) * square2.transpose(), (ref2.row(0) = m1.row(0) * square2.transpose())); + VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.row(0) * square2, (ref2.row(0) = m1.row(0) * square2)); + VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.block(0,0,1,cols) * square2, (ref2.row(0) = m1.row(0) * square2)); + } + // inner product { Scalar x = square2.row(c) * square2.col(c2); -- cgit v1.2.3 From 
06a2bc7c9c6af150f54605c74a95379a7c12ca28 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 17 Feb 2016 14:41:59 -0800 Subject: Tiny bugfix in SpecialFunctions: some compilers don't like doubles implicitly downcast to floats in an array constructor. --- Eigen/src/Core/SpecialFunctions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 6c6b21f98..6b4598e3e 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -182,10 +182,10 @@ struct digamma_impl_maybe_poly { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float s) { const float A[] = { - -4.16666666666666666667E-3, - 3.96825396825396825397E-3, - -8.33333333333333333333E-3, - 8.33333333333333333333E-2 + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f }; float z; -- cgit v1.2.3 From 8ce46f9d8959236c0dfb6dd7dca7423d825f0c59 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 18 Feb 2016 13:24:34 -0800 Subject: Improved implementation of ptanh for SSE and AVX --- Eigen/src/Core/arch/AVX/MathFunctions.h | 42 +++++++++++++++------------------ Eigen/src/Core/arch/SSE/MathFunctions.h | 33 +++++++++++++------------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index a24bf6e26..98d8e029f 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -267,31 +267,34 @@ pexp(const Packet8f& _x) { // Hyperbolic Tangent function. // Doesn't do anything fancy, just a 13/6-degree rational interpolant which -// is accurate up to a couple of ulp in the range [-8, 8], outside of which the +// is accurate up to a couple of ulp in the range [-9, 9], outside of which the // fl(tanh(x)) = +/-1. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f ptanh(const Packet8f& _x) { - // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later. - const Packet8f x = _mm256_mul_ps(_x, _mm256_set1_ps(0.125f)); + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is +/-1.0f in single-precision. + _EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f); + _EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f); + const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x)); // The monomial coefficients of the numerator polynomial (odd). - _EIGEN_DECLARE_CONST_Packet8f(alpha_1, -2.47030171958948e-03f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_3, -2.06804010015822e-02f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_5, -3.13693994587418e-02f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_7, -7.19851201683627e-03f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_9, 8.31561269687160e-04f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_11, -1.37626659546502e-04f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_13, 1.39116714700458e-05f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f); // The monomial coefficients of the denominator polynomial (even). 
- _EIGEN_DECLARE_CONST_Packet8f(beta_0, -3.08787724141615e-04f); - _EIGEN_DECLARE_CONST_Packet8f(beta_2, -9.17251911622436e-03f); - _EIGEN_DECLARE_CONST_Packet8f(beta_4, -3.09625062090444e-02f); - _EIGEN_DECLARE_CONST_Packet8f(beta_6, -2.05669680763032e-02f); + _EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f); + _EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f); + _EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f); + _EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f); // Since the polynomials are odd/even, we need x^2. - const Packet8f x2 = _mm256_mul_ps(x, x); + const Packet8f x2 = pmul(x, x); // Evaluate the numerator polynomial p. Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11); @@ -308,14 +311,7 @@ ptanh(const Packet8f& _x) { q = pmadd(x2, q, p8f_beta_0); // Divide the numerator by the denominator. - const Packet8f res = pdiv(p, q); - - // Mask-out values outside of [-8, 8]. - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(minus_one, -1.0f); - return _mm256_blendv_ps( - _mm256_blendv_ps(res, p8f_one, _mm256_cmp_ps(x, p8f_one, _CMP_GT_OQ)), - p8f_minus_one, _mm256_cmp_ps(x, p8f_minus_one, _CMP_LT_OQ)); + return pdiv(p, q); } template <> diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index a7a0d906f..28f103eeb 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -518,30 +518,31 @@ Packet2d prsqrt(const Packet2d& x) { // Hyperbolic Tangent function. // Doesn't do anything fancy, just a 13/6-degree rational interpolant which -// is accurate up to a couple of ulp in the range [-8, 8], outside of which the +// is accurate up to a couple of ulp in the range [-9, 9], outside of which the // fl(tanh(x)) = +/-1. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh(const Packet4f& _x) { - // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later. - const Packet4f x = - pmax(pset1(-1.0f), - pmin(pset1(1.0f), pmul(_x, pset1(0.125f)))); + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is +/-1.0f in single-precision. + _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f); + _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f); + const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x)); // The monomial coefficients of the numerator polynomial (odd). - _EIGEN_DECLARE_CONST_Packet4f(alpha_1, -2.47030171958948e-03f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_3, -2.06804010015822e-02f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_5, -3.13693994587418e-02f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_7, -7.19851201683627e-03f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_9, 8.31561269687160e-04f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_11, -1.37626659546502e-04f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_13, 1.39116714700458e-05f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); // The monomial coefficients of the denominator polynomial (even). 
- _EIGEN_DECLARE_CONST_Packet4f(beta_0, -3.08787724141615e-04f); - _EIGEN_DECLARE_CONST_Packet4f(beta_2, -9.17251911622436e-03f); - _EIGEN_DECLARE_CONST_Packet4f(beta_4, -3.09625062090444e-02f); - _EIGEN_DECLARE_CONST_Packet4f(beta_6, -2.05669680763032e-02f); + _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f); + _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f); + _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f); + _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f); // Since the polynomials are odd/even, we need x^2. const Packet4f x2 = pmul(x, x); -- cgit v1.2.3 From 17b9fbed34cefe08b4f63dbe0734e12311eb8669 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 06:16:07 +0000 Subject: Added preliminary support for half floats on CUDA GPU. For now we can simply convert floats into half floats and vice versa --- Eigen/Core | 3 + Eigen/src/Core/arch/CUDA/PacketMath.h | 1 - Eigen/src/Core/arch/CUDA/TypeCasting.h | 100 +++++++++++++++++++++++++++++++++ unsupported/test/CMakeLists.txt | 8 ++- 4 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 Eigen/src/Core/arch/CUDA/TypeCasting.h diff --git a/Eigen/Core b/Eigen/Core index 63602f4c3..17f864084 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -200,6 +200,7 @@ #if defined __CUDACC__ #define EIGEN_VECTORIZE_CUDA #include + #include #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) @@ -329,7 +330,9 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/PacketMathHalf.h" #include "src/Core/arch/CUDA/MathFunctions.h" + #include "src/Core/arch/CUDA/TypeCasting.h" #endif #include "src/Core/arch/Default/Settings.h" diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index d3d9f910e..d5dcc7fa3 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -21,7 +21,6 @@ namespace internal { template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; - template<> struct packet_traits : default_packet_traits { typedef float4 type; diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h new file mode 100644 index 000000000..a8c06ff48 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -0,0 +1,100 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
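As an aside on the two ptanh kernels above: a scalar sketch of the same 13/6-degree rational approximation, using the clamp and coefficients from this patch (illustration only; the real code stays vectorized and uses fused multiply-adds where available):

#include <algorithm>

// tanh(x) ~= x*P(x^2) / Q(x^2), with fl(tanh(x)) = +/-1 outside [-9, 9].
float tanh_rational(float x) {
  x = std::max(-9.0f, std::min(9.0f, x));
  const float x2 = x * x;
  // Numerator (odd polynomial), Horner evaluation in x2:
  float p = -2.76076847742355e-16f;     // alpha_13
  p = p * x2 + 2.00018790482477e-13f;   // alpha_11
  p = p * x2 - 8.60467152213735e-11f;   // alpha_9
  p = p * x2 + 5.12229709037114e-08f;   // alpha_7
  p = p * x2 + 1.48572235717979e-05f;   // alpha_5
  p = p * x2 + 6.37261928875436e-04f;   // alpha_3
  p = p * x2 + 4.89352455891786e-03f;   // alpha_1
  p = p * x;
  // Denominator (even polynomial):
  float q = 1.19825839466702e-06f;      // beta_6
  q = q * x2 + 1.18534705686654e-04f;   // beta_4
  q = q * x2 + 2.26843463243900e-03f;   // beta_2
  q = q * x2 + 4.89352518554385e-03f;   // beta_0
  return p / q;
}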
+ +#ifndef EIGEN_TYPE_CASTING_CUDA_H +#define EIGEN_TYPE_CASTING_CUDA_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __float2half(a); + #else + assert(false && "tbd"); + return half(); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __half2float(a); + #else + assert(false && "tbd"); + return 0.0f; + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +#else + assert(false && "tbd"); + return float4(); +#endif +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { + // Simply discard the second half of the input +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __float22half2_rn(make_float2(a.x, a.y)); +#else + assert(false && "tbd"); + return half2(); +#endif +} + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_CUDA_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index c202cf0e4..678a0d1d7 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -37,9 +37,9 @@ if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$") ei_add_test(BVH) endif() -ei_add_test(matrix_exponential) +#ei_add_test(matrix_exponential) ei_add_test(matrix_function) -ei_add_test(matrix_power) +#ei_add_test(matrix_power) ei_add_test(matrix_square_root) ei_add_test(alignedvector3) @@ -173,5 +173,9 @@ if(CUDA_FOUND) ei_add_test(cxx11_tensor_random_cuda) ei_add_test(cxx11_tensor_argmax_cuda) + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_53 -Xcudafe \"--display_error_number\"") + ei_add_test(cxx11_tensor_of_float16_cuda) + + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() -- cgit v1.2.3 From 7151bd876845c15cb6b8abc0886d7917ece635ed Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 06:20:50 +0000 Subject: Reverted unintended changes introduced by a bad merge --- Eigen/Core | 1 - unsupported/test/CMakeLists.txt | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 17f864084..3edbe6585 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -330,7 +330,6 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" - #include "src/Core/arch/CUDA/PacketMathHalf.h" #include "src/Core/arch/CUDA/MathFunctions.h" #include "src/Core/arch/CUDA/TypeCasting.h" #endif diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 
678a0d1d7..2c686177b 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -37,9 +37,9 @@ if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$") ei_add_test(BVH) endif() -#ei_add_test(matrix_exponential) +ei_add_test(matrix_exponential) ei_add_test(matrix_function) -#ei_add_test(matrix_power) +ei_add_test(matrix_power) ei_add_test(matrix_square_root) ei_add_test(alignedvector3) -- cgit v1.2.3 From f36c0c2c65a78959f6ccbbc29c6e80f86b062bc8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 06:23:28 +0000 Subject: Added regression test for float16 --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 60 ++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 unsupported/test/cxx11_tensor_of_float16_cuda.cu diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu new file mode 100644 index 000000000..e9f5dd968 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -0,0 +1,60 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include + +using Eigen::Tensor; + +void test_cuda_conversion() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + half* d_half = (half*)gpu_device.allocate(num_elem * sizeof(half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random(); + gpu_half.device(gpu_device) = gpu_float.cast(); + gpu_conv.device(gpu_device) = gpu_half.cast(); + + Tensor initial(num_elem); + Tensor final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + + +void test_cxx11_tensor_of_float16_cuda() +{ + CALL_SUBTEST_1(test_cuda_conversion()); +} -- cgit v1.2.3 From 0606a0a39bcf01b0a03f0dcd17f7075fce8c402c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 18 Feb 2016 23:15:23 -0800 Subject: FP16 on CUDA are only available starting with cuda 7.5. 
Disable them when using an older version of CUDA --- Eigen/Core | 5 ++++- Eigen/src/Core/arch/CUDA/TypeCasting.h | 3 +++ unsupported/test/cxx11_tensor_of_float16_cuda.cu | 5 ++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 3edbe6585..834ff9415 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -200,7 +200,10 @@ #if defined __CUDACC__ #define EIGEN_VECTORIZE_CUDA #include - #include + #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 + #define EIGEN_HAS_CUDA_FP16 + #include + #endif #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index a8c06ff48..279fd4fd0 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -14,6 +14,8 @@ namespace Eigen { namespace internal { +#if defined(EIGEN_HAS_CUDA_FP16) + template<> struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) @@ -92,6 +94,7 @@ template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { #endif } +#endif } // end namespace internal diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index e9f5dd968..aee222a14 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -19,6 +19,7 @@ using Eigen::Tensor; +#ifdef EIGEN_HAS_CUDA_FP16 void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -52,9 +53,11 @@ void test_cuda_conversion() { gpu_device.deallocate(d_half); gpu_device.deallocate(d_conv); } - +#endif void test_cxx11_tensor_of_float16_cuda() { +#ifdef EIGEN_HAS_CUDA_FP16 CALL_SUBTEST_1(test_cuda_conversion()); +#endif } -- cgit v1.2.3 From ac5d706a942faa275ea467009d2004a7aeb3e3fa Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 08:19:12 +0000 Subject: Added support for simple coefficient wise tensor expression using half floats on CUDA devices --- Eigen/Core | 1 + Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 220 +++++++++++++++++++++++ unsupported/test/cxx11_tensor_of_float16_cuda.cu | 43 +++++ 3 files changed, 264 insertions(+) create mode 100644 Eigen/src/Core/arch/CUDA/PacketMathHalf.h diff --git a/Eigen/Core b/Eigen/Core index 834ff9415..7107f83d0 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -333,6 +333,7 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/PacketMathHalf.h" #include "src/Core/arch/CUDA/MathFunctions.h" #include "src/Core/arch/CUDA/TypeCasting.h" #endif diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h new file mode 100644 index 000000000..7f99376fb --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -0,0 +1,220 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
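The file that follows leans on one idiom worth spelling out: fp16 arithmetic intrinsics only exist for compute capability 5.3 and up, so each wrapper is always declared but its body is guarded. A minimal sketch of that pattern (hypothetical function name):

#include <assert.h>
#include <cuda_fp16.h>

// Always declared, so instantiation on the host side still compiles; the
// intrinsic is only emitted when compiling for sm_53 or newer.
__device__ inline half sketch_hadd(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hadd(a, b);
#else
  assert(false && "fp16 arithmetic requires compute capability >= 5.3");
  return half();
#endif
}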
+ +#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H +#define EIGEN_PACKET_MATH_HALF_CUDA_H + +namespace Eigen { + +namespace internal { + +#if defined(EIGEN_HAS_CUDA_FP16) + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +__device__ half operator + (const half& a, const half& b) { + return __hadd(a, b); +} +__device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +__device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +__device__ half operator / (const half& a, const half& b) { + assert(false && "tbd"); + return half(); +} +__device__ half operator - (const half& a) { + return __hneg(a); +} + + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasErf = 1, + HasErfc = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const half& from) { + return __half2half2(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const half& a) { + return __halves2half2(a, __hadd(a, __float2half(1))); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { + return __hadd2(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { + return __hsub2(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { + return __hneg2(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { + return __hmul2(a, b); +} + + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { + return __hfma2(a, b, c); + } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + half r2 = a2 > b2 ? 
__high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const half* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE half2 ploaddup(const half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(half* to, const half2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { + return __ldg((const half2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { + return __halves2half2(__ldg(from+0), __ldg(from+1)); +} + +template<> EIGEN_DEVICE_FUNC inline half2 pgather(const half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> EIGEN_DEVICE_FUNC inline half pfirst(const half2& a) { + return __low2half(a); +} + +template<> EIGEN_DEVICE_FUNC inline half predux(const half2& a) { + return __hadd(__low2half(a), __high2half(a)); +} + +template<> EIGEN_DEVICE_FUNC inline half predux_max(const half2& a) { + half first = __low2half(a); + half second = __high2half(a); + return __hgt(first, second) ? first : second; +} + +template<> EIGEN_DEVICE_FUNC inline half predux_min(const half2& a) { + half first = __low2half(a); + half second = __high2half(a); + return __hlt(first, second) ? 
first : second; +} + +template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) { + return __hmul(__low2half(a), __high2half(a)); +} + +template<> EIGEN_DEVICE_FUNC inline half2 pabs(const half2& a) { + assert(false && "tbd"); + return half2(); +} + + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + assert(false && "tbd"); + // half tmp = kernel.packet[0].y; + // kernel.packet[0].y = kernel.packet[1].x; + // kernel.packet[1].x = tmp; +} + +#endif +#endif +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index aee222a14..26c18a718 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -20,6 +20,7 @@ using Eigen::Tensor; #ifdef EIGEN_HAS_CUDA_FP16 + void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -53,11 +54,53 @@ void test_cuda_conversion() { gpu_device.deallocate(d_half); gpu_device.deallocate(d_conv); } + +void test_cuda_elementwise() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float1( + d_float1, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_float2( + d_float2, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; + gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).cast(); + + Tensor half_prec(num_elem); + Tensor full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + #endif + void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_element_wise()); #endif } -- cgit v1.2.3 From cd042dbbfdd4680c983d89c4f526c49d4657c05d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:03:26 +0000 Subject: Fixed a bug in the tensor type converter --- unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index d2defcaf4..e254c0b7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -124,9 +124,12 @@ struct PacketConverter { return internal::pcast(m_impl.template packet(index)); } else { const int TgtPacketSize = 
internal::unpacket_traits::size; + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TgtType; + internal::scalar_cast_op converter; EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; for (int i = 0; i < TgtPacketSize; ++i) { - values[i] = m_impl.coeff(index+i); + values[i] = converter(m_impl.coeff(index+i)); } TgtPacket rslt = internal::pload(values); return rslt; -- cgit v1.2.3 From dc26459b9910d8c1fda964917635ee8277dd2614 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:16:54 +0000 Subject: Implemented protate() for CUDA --- Eigen/src/Core/arch/CUDA/PacketMath.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index d5dcc7fa3..a32b41e18 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -272,6 +272,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) return a.x * a.y; } +template +struct protate_impl +{ + static float4 run(const float4& a) { + if (offset == 0) { + return make_float4(a.x, a.y, a.z, a.w); + } + if (offset == 1) { + return make_float4(a.w, a.x, a.y, a.z); + } + if (offset == 2) { + return make_float4(a.z, a.w, a.x, a.y); + } + return make_float4(a.y, a.z, a.w, a.x); + } +}; + +template +struct protate_impl +{ + static double2 run(const double2& a) { + if (offset == 0) { + return make_double2(a.x, a.y); + } + return make_double2(a.y, a.x); + } +}; + + template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); } -- cgit v1.2.3 From f7cb755299b8cdee3b8eaffd2941af9ee6d08b04 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:57:26 +0000 Subject: Added support for operators +=, -=, *= and /= on CUDA half floats --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 7f99376fb..c99c1acf7 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -39,6 +39,22 @@ __device__ half operator / (const half& a, const half& b) { __device__ half operator - (const half& a) { return __hneg(a); } +__device__ half operator += (half& a, const half& b) { + a = __hadd(a, b); + return a; +} +__device__ half operator *= (half& a, const half& b) { + a = __hmul(a, b); + return a; +} +__device__ half operator -= (half& a, const half& b) { + a = __hsub(a, b); + return a; +} +__device__ half operator /= (half& a, const half& b) { + assert(false && "tbd"); + return a; +} template<> struct is_arithmetic { enum { value = true }; }; -- cgit v1.2.3 From f3352e0fb02d1048a4c21c969b10e84185f4e5bf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:58:57 +0000 Subject: Don't make the array constructors explicit --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 56e2b8afc..eae8b996c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -42,7 +42,7 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() { } - explicit EIGEN_DEVICE_FUNC + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v) { EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v; -- cgit v1.2.3 From a08d2ff0c911f7f27dd8cb0ca14fa5b9419b3488 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:59:59 +0000 Subject: Started to work on contractions and reductions using half floats --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 95 +++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 26c18a718..d3cd94cd6 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -94,6 +94,97 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_float); } +/* +void test_cuda_contractions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int rows = 101; + int cols = 101; + int num_elem = rows*cols; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float1( + d_float1, rows, cols); + Eigen::TensorMap, Eigen::Aligned> gpu_float2( + d_float2, rows, cols); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half, rows, cols); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float, rows, cols); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims(DimPair(1, 0)); + gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims); + gpu_res_half.device(gpu_device) = gpu_float1.cast().contract(gpu_float2.cast(), dims).cast(); + + Tensor half_prec(rows, cols); + Tensor full_prec(rows, cols); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + } + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +void test_cuda_reductions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int size = 101; + int num_elem = size*size; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float1( + d_float1, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_float2( + d_float2, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half, size); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float, size); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + + Eigen::array redux_dim = {{0}}; + gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim); + gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim).cast(); + + Tensor 
half_prec(size); + Tensor full_prec(size); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + + for (int i = 0; i < size; ++i) { + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} +*/ + #endif @@ -101,6 +192,8 @@ void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 CALL_SUBTEST_1(test_cuda_conversion()); - CALL_SUBTEST_1(test_cuda_element_wise()); + CALL_SUBTEST_1(test_cuda_elementwise()); +// CALL_SUBTEST_2(test_cuda_contractions()); +// CALL_SUBTEST_3(test_cuda_reductions()); #endif } -- cgit v1.2.3 From f268db1c4bd0510f13f0218205c2e135f2790175 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 16:31:04 +0000 Subject: Added the ability to query the minor version of a cuda device --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index e684ab8f7..3808eb155 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -247,6 +247,14 @@ struct GpuDevice { return 0; #endif } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + return stream_->deviceProperties().minor; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; -- cgit v1.2.3 From 5c4901b83a3ec15988521e195abc05e804c541dc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 10:03:19 -0800 Subject: Implemented the scalar division of 2 half floats --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index c99c1acf7..d0106f4f1 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -33,8 +33,9 @@ __device__ half operator - (const half& a, const half& b) { return __hsub(a, b); } __device__ half operator / (const half& a, const half& b) { - assert(false && "tbd"); - return half(); + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); } __device__ half operator - (const half& a) { return __hneg(a); @@ -52,7 +53,7 @@ __device__ half operator -= (half& a, const half& b) { return a; } __device__ half operator /= (half& a, const half& b) { - assert(false && "tbd"); + a = a / b; return a; } -- cgit v1.2.3 From 180156ba1aefceae0bd93f056e5807a83ccbb1b5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 10:05:59 -0800 Subject: Added support for tensor reductions on half floats --- Eigen/src/Core/arch/CUDA/TypeCasting.h | 20 +++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 15 ++++++++------ unsupported/test/cxx11_tensor_of_float16_cuda.cu | 23 +++++++++++++++------- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index 279fd4fd0..2742a4e7b 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ 
b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -34,6 +34,26 @@ template<> struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __float2half(static_cast(a)); + #else + assert(false && "tbd"); + return half(); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + template<> struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index f94ffa020..e2d876140 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -72,11 +72,12 @@ template struct SumReducer } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return static_cast(0); + internal::scalar_cast_op conv; + return conv(0); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(0); + return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; @@ -110,11 +111,12 @@ template struct MeanReducer } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return static_cast(0); + internal::scalar_cast_op conv; + return conv(0); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(0); + return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum / scalarCount_; @@ -214,11 +216,12 @@ template struct ProdReducer } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return static_cast(1); + internal::scalar_cast_op conv; + return conv(1); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(1); + return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index d3cd94cd6..5ce96a1c2 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -93,7 +93,6 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } - /* void test_cuda_contractions() { Eigen::CudaStreamDevice stream; @@ -139,7 +138,7 @@ void test_cuda_contractions() { gpu_device.deallocate(d_float2); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); -} +}*/ void test_cuda_reductions() { @@ -183,7 +182,7 @@ void test_cuda_reductions() { gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } -*/ + #endif @@ -191,9 +190,19 @@ void test_cuda_reductions() { void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 - CALL_SUBTEST_1(test_cuda_conversion()); - CALL_SUBTEST_1(test_cuda_elementwise()); -// CALL_SUBTEST_2(test_cuda_contractions()); -// CALL_SUBTEST_3(test_cuda_reductions()); + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice device(&stream); + if (device.majorDeviceVersion() > 5 || + (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { + CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_elementwise()); +// 
CALL_SUBTEST_2(test_cuda_contractions()); + CALL_SUBTEST_3(test_cuda_reductions()); + } + else { + std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl; + } +#else + std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; #endif } -- cgit v1.2.3 From 670db7988d6903fbb51c449383c4d5162d83caaf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 13:03:26 -0800 Subject: Updated the contraction code to make it compatible with half floats. --- .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 102 +++++++++++---------- 1 file changed, 55 insertions(+), 47 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index a5f3debc4..f5b539c7e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -99,23 +99,23 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, #define prefetchIntoRegisters(base_k) \ { \ - lhs_pf0 = Scalar(0); \ - lhs_pf1 = Scalar(0); \ - lhs_pf2 = Scalar(0); \ - lhs_pf3 = Scalar(0); \ - lhs_pf4 = Scalar(0); \ - lhs_pf5 = Scalar(0); \ - lhs_pf6 = Scalar(0); \ - lhs_pf7 = Scalar(0); \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ \ - rhs_pf0 = Scalar(0); \ - rhs_pf1 = Scalar(0); \ - rhs_pf2 = Scalar(0); \ - rhs_pf3 = Scalar(0); \ - rhs_pf4 = Scalar(0); \ - rhs_pf5 = Scalar(0); \ - rhs_pf6 = Scalar(0); \ - rhs_pf7 = Scalar(0); \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ \ if (!needs_edge_check || lhs_vert < m_size) { \ const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ @@ -261,15 +261,16 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, // declare and initialize result array #define res(i, j) _res_##i##j #define initResultRow(i) \ - Scalar res(i, 0) = Scalar(0); \ - Scalar res(i, 1) = Scalar(0); \ - Scalar res(i, 2) = Scalar(0); \ - Scalar res(i, 3) = Scalar(0); \ - Scalar res(i, 4) = Scalar(0); \ - Scalar res(i, 5) = Scalar(0); \ - Scalar res(i, 6) = Scalar(0); \ - Scalar res(i, 7) = Scalar(0); \ - + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; initResultRow(0); initResultRow(1); initResultRow(2); @@ -1313,6 +1314,34 @@ struct TensorEvaluator struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, 
const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + template void evalTyped(Scalar* buffer) const { // columns in left side, rows in right side @@ -1353,28 +1382,7 @@ struct TensorEvaluator::value && - internal::is_same::value) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); - } - } else { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); - } + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); } }; -- cgit v1.2.3 From f3643eec57a114ff444237027b2f61034a961ea8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 19 Feb 2016 22:15:01 +0100 Subject: Add typedefs for the return type of all block methods. --- Eigen/src/plugins/BlockMethods.h | 145 ++++++++++++++++++++------------------- 1 file changed, 75 insertions(+), 70 deletions(-) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index 9b7fdc4aa..632094e15 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -8,7 +8,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
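The motivation for these typedefs shows up as soon as one has to spell a block method's return type by hand; a small sketch with a hypothetical helper (the new FixedBlockXpr typedefs centralize this spelling inside Eigen itself):

#include <Eigen/Dense>

// topLeftCorner<2,2>() returns Block<Derived,2,2>. Note the 'template'
// disambiguator, required here because 'm' depends on a template parameter.
template <typename Derived>
Eigen::Block<Derived, 2, 2> topLeft2x2(Eigen::MatrixBase<Derived>& m) {
  return m.template topLeftCorner<2, 2>();
}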
- #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal expression type of a column */ @@ -29,6 +28,12 @@ template struct ConstNColsBlockXpr { typedef const Block struct NRowsBlockXpr { typedef Block::ColsAtCompileTime, IsRowMajor> Type; }; template struct ConstNRowsBlockXpr { typedef const Block::ColsAtCompileTime, IsRowMajor> Type; }; +/** \internal expression of a block */ +typedef Block BlockXpr; +typedef const Block ConstBlockXpr; +/** \internal expression of a block of fixed sizes */ +template struct FixedBlockXpr { typedef Block Type; }; +template struct ConstFixedBlockXpr { typedef Block Type; }; typedef VectorBlock SegmentReturnType; typedef const VectorBlock ConstSegmentReturnType; @@ -54,16 +59,16 @@ template struct ConstFixedSegmentReturnType { typedef const VectorBloc * \sa class Block, block(Index,Index) */ EIGEN_DEVICE_FUNC -inline Block block(Index startRow, Index startCol, Index blockRows, Index blockCols) +inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) { - return Block(derived(), startRow, startCol, blockRows, blockCols); + return BlockXpr(derived(), startRow, startCol, blockRows, blockCols); } /** This is the const version of block(Index,Index,Index,Index). */ EIGEN_DEVICE_FUNC -inline const Block block(Index startRow, Index startCol, Index blockRows, Index blockCols) const +inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const { - return Block(derived(), startRow, startCol, blockRows, blockCols); + return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols); } @@ -80,16 +85,16 @@ inline const Block block(Index startRow, Index startCol, Index bl * \sa class Block, block(Index,Index,Index,Index) */ EIGEN_DEVICE_FUNC -inline Block topRightCorner(Index cRows, Index cCols) +inline BlockXpr topRightCorner(Index cRows, Index cCols) { - return Block(derived(), 0, cols() - cCols, cRows, cCols); + return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols); } /** This is the const version of topRightCorner(Index, Index).*/ EIGEN_DEVICE_FUNC -inline const Block topRightCorner(Index cRows, Index cCols) const +inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const { - return Block(derived(), 0, cols() - cCols, cRows, cCols); + return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols); } /** \returns an expression of a fixed-size top-right corner of *this. @@ -104,17 +109,17 @@ inline const Block topRightCorner(Index cRows, Index cCols) const */ template EIGEN_DEVICE_FUNC -inline Block topRightCorner() +inline typename FixedBlockXpr::Type topRightCorner() { - return Block(derived(), 0, cols() - CCols); + return typename FixedBlockXpr::Type(derived(), 0, cols() - CCols); } /** This is the const version of topRightCorner().*/ template EIGEN_DEVICE_FUNC -inline const Block topRightCorner() const +inline const typename ConstFixedBlockXpr::Type topRightCorner() const { - return Block(derived(), 0, cols() - CCols); + return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - CCols); } /** \returns an expression of a top-right corner of *this. 
@@ -135,16 +140,16 @@ inline const Block topRightCorner() const * \sa class Block */ template -inline Block topRightCorner(Index cRows, Index cCols) +inline typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) { - return Block(derived(), 0, cols() - cCols, cRows, cCols); + return typename FixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } /** This is the const version of topRightCorner(Index, Index).*/ template -inline const Block topRightCorner(Index cRows, Index cCols) const +inline const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const { - return Block(derived(), 0, cols() - cCols, cRows, cCols); + return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } @@ -160,16 +165,16 @@ inline const Block topRightCorner(Index cRows, Inde * \sa class Block, block(Index,Index,Index,Index) */ EIGEN_DEVICE_FUNC -inline Block topLeftCorner(Index cRows, Index cCols) +inline BlockXpr topLeftCorner(Index cRows, Index cCols) { - return Block(derived(), 0, 0, cRows, cCols); + return BlockXpr(derived(), 0, 0, cRows, cCols); } /** This is the const version of topLeftCorner(Index, Index).*/ EIGEN_DEVICE_FUNC -inline const Block topLeftCorner(Index cRows, Index cCols) const +inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const { - return Block(derived(), 0, 0, cRows, cCols); + return ConstBlockXpr(derived(), 0, 0, cRows, cCols); } /** \returns an expression of a fixed-size top-left corner of *this. @@ -183,17 +188,17 @@ inline const Block topLeftCorner(Index cRows, Index cCols) const */ template EIGEN_DEVICE_FUNC -inline Block topLeftCorner() +inline typename FixedBlockXpr::Type topLeftCorner() { - return Block(derived(), 0, 0); + return typename FixedBlockXpr::Type(derived(), 0, 0); } /** This is the const version of topLeftCorner().*/ template EIGEN_DEVICE_FUNC -inline const Block topLeftCorner() const +inline const typename ConstFixedBlockXpr::Type topLeftCorner() const { - return Block(derived(), 0, 0); + return typename ConstFixedBlockXpr::Type(derived(), 0, 0); } /** \returns an expression of a top-left corner of *this. 
@@ -214,16 +219,16 @@ inline const Block topLeftCorner() const * \sa class Block */ template -inline Block topLeftCorner(Index cRows, Index cCols) +inline typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) { - return Block(derived(), 0, 0, cRows, cCols); + return typename FixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } /** This is the const version of topLeftCorner(Index, Index).*/ template -inline const Block topLeftCorner(Index cRows, Index cCols) const +inline const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const { - return Block(derived(), 0, 0, cRows, cCols); + return typename ConstFixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } @@ -239,16 +244,16 @@ inline const Block topLeftCorner(Index cRows, Index * \sa class Block, block(Index,Index,Index,Index) */ EIGEN_DEVICE_FUNC -inline Block bottomRightCorner(Index cRows, Index cCols) +inline BlockXpr bottomRightCorner(Index cRows, Index cCols) { - return Block(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } /** This is the const version of bottomRightCorner(Index, Index).*/ EIGEN_DEVICE_FUNC -inline const Block bottomRightCorner(Index cRows, Index cCols) const +inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const { - return Block(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } /** \returns an expression of a fixed-size bottom-right corner of *this. @@ -262,17 +267,17 @@ inline const Block bottomRightCorner(Index cRows, Index cCols) co */ template EIGEN_DEVICE_FUNC -inline Block bottomRightCorner() +inline typename FixedBlockXpr::Type bottomRightCorner() { - return Block(derived(), rows() - CRows, cols() - CCols); + return typename FixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } /** This is the const version of bottomRightCorner().*/ template EIGEN_DEVICE_FUNC -inline const Block bottomRightCorner() const +inline const typename ConstFixedBlockXpr::Type bottomRightCorner() const { - return Block(derived(), rows() - CRows, cols() - CCols); + return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } /** \returns an expression of a bottom-right corner of *this. 
@@ -293,16 +298,16 @@ inline const Block bottomRightCorner() const * \sa class Block */ template -inline Block bottomRightCorner(Index cRows, Index cCols) +inline typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) { - return Block(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return typename FixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } /** This is the const version of bottomRightCorner(Index, Index).*/ template -inline const Block bottomRightCorner(Index cRows, Index cCols) const +inline const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const { - return Block(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } @@ -318,16 +323,16 @@ inline const Block bottomRightCorner(Index cRows, I * \sa class Block, block(Index,Index,Index,Index) */ EIGEN_DEVICE_FUNC -inline Block bottomLeftCorner(Index cRows, Index cCols) +inline BlockXpr bottomLeftCorner(Index cRows, Index cCols) { - return Block(derived(), rows() - cRows, 0, cRows, cCols); + return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols); } /** This is the const version of bottomLeftCorner(Index, Index).*/ EIGEN_DEVICE_FUNC -inline const Block bottomLeftCorner(Index cRows, Index cCols) const +inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const { - return Block(derived(), rows() - cRows, 0, cRows, cCols); + return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols); } /** \returns an expression of a fixed-size bottom-left corner of *this. @@ -341,17 +346,17 @@ inline const Block bottomLeftCorner(Index cRows, Index cCols) con */ template EIGEN_DEVICE_FUNC -inline Block bottomLeftCorner() +inline typename FixedBlockXpr::Type bottomLeftCorner() { - return Block(derived(), rows() - CRows, 0); + return typename FixedBlockXpr::Type(derived(), rows() - CRows, 0); } /** This is the const version of bottomLeftCorner().*/ template EIGEN_DEVICE_FUNC -inline const Block bottomLeftCorner() const +inline const typename ConstFixedBlockXpr::Type bottomLeftCorner() const { - return Block(derived(), rows() - CRows, 0); + return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, 0); } /** \returns an expression of a bottom-left corner of *this. @@ -372,16 +377,16 @@ inline const Block bottomLeftCorner() const * \sa class Block */ template -inline Block bottomLeftCorner(Index cRows, Index cCols) +inline typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) { - return Block(derived(), rows() - cRows, 0, cRows, cCols); + return typename FixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } /** This is the const version of bottomLeftCorner(Index, Index).*/ template -inline const Block bottomLeftCorner(Index cRows, Index cCols) const +inline const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const { - return Block(derived(), rows() - cRows, 0, cRows, cCols); + return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } @@ -704,7 +709,7 @@ inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = /** \returns a fixed-size expression of a block in *this. * - * The template parameters \a BlockRows and \a BlockCols are the number of + * The template parameters \a NRows and \a NCols are the number of * rows and columns in the block. 
* * \param startRow the first row in the block @@ -718,25 +723,25 @@ inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = * * \sa class Block, block(Index,Index,Index,Index) */ -template +template EIGEN_DEVICE_FUNC -inline Block block(Index startRow, Index startCol) +inline typename FixedBlockXpr::Type block(Index startRow, Index startCol) { - return Block(derived(), startRow, startCol); + return typename FixedBlockXpr::Type(derived(), startRow, startCol); } /** This is the const version of block<>(Index, Index). */ -template +template EIGEN_DEVICE_FUNC -inline const Block block(Index startRow, Index startCol) const +inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const { - return Block(derived(), startRow, startCol); + return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol); } /** \returns an expression of a block in *this. * - * \tparam BlockRows number of rows in block as specified at compile-time - * \tparam BlockCols number of columns in block as specified at compile-time + * \tparam NRows number of rows in block as specified at compile-time + * \tparam NCols number of columns in block as specified at compile-time * \param startRow the first row in the block * \param startCol the first column in the block * \param blockRows number of rows in block as specified at run-time @@ -744,27 +749,27 @@ inline const Block block(Index startRow, In * * This function is mainly useful for blocks where the number of rows is specified at compile-time * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time - * information should not contradict. In other words, \a blockRows should equal \a BlockRows unless - * \a BlockRows is \a Dynamic, and the same for the number of columns. + * information should not contradict. In other words, \a blockRows should equal \a NRows unless + * \a NRows is \a Dynamic, and the same for the number of columns. * * Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp * Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp * * \sa class Block, block(Index,Index,Index,Index) */ -template -inline Block block(Index startRow, Index startCol, +template +inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { - return Block(derived(), startRow, startCol, blockRows, blockCols); + return typename FixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); } /** This is the const version of block<>(Index, Index, Index, Index). */ -template -inline const Block block(Index startRow, Index startCol, +template +inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) const { - return Block(derived(), startRow, startCol, blockRows, blockCols); + return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); } /** \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0. -- cgit v1.2.3 From 6f0992c05b92163ed5fb0cedf03faed43805519f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 19 Feb 2016 22:21:02 +0100 Subject: Fix nesting type and complete reflection methods of Block expressions. 
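For readers skimming the log: the "reflection methods" in question are the accessors that let generic code recover where a Block sits inside its host expression (startRow(), startCol(), blockRows(), blockCols()) and reach the host itself via nestedExpression(), which this change completes with a non-const overload. A minimal sketch of the intended use, with illustrative sizes not taken from the patch:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Zero(6, 6);
  Eigen::Block<Eigen::MatrixXd> b = m.block(1, 2, 3, 3);
  // Reflection: recover the block's placement inside its nested expression.
  std::cout << b.startRow() << ", " << b.startCol() << "\n";  // prints 1, 2
  // The non-const nestedExpression() lets a generic algorithm write through
  // the block back into the host matrix.
  b.nestedExpression()(0, 0) = 42.0;
  std::cout << m(0, 0) << "\n";  // prints 42
}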
--- Eigen/src/Core/Block.h | 30 ++++++++++++++++++++++++++---- Eigen/src/SparseCore/SparseBlock.h | 15 +++++++++------ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index cf962aed1..661e64f3d 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -173,6 +173,7 @@ template >::type { typedef Block BlockType; + typedef typename internal::ref_selector::non_const_type XprTypeNested; public: typedef typename internal::dense_xpr_base::type Base; @@ -294,10 +295,13 @@ template::type& nestedExpression() const + const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } EIGEN_DEVICE_FUNC StorageIndex startRow() const @@ -313,7 +317,7 @@ template m_startRow; const internal::variable_if_dynamic m_startCol; const internal::variable_if_dynamic m_blockRows; @@ -326,6 +330,7 @@ class BlockImpl_dense : public MapBase > { typedef Block BlockType; + typedef typename internal::ref_selector::non_const_type XprTypeNested; enum { XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0 }; @@ -371,10 +376,13 @@ class BlockImpl_dense } EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& nestedExpression() const + const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } /** \sa MapBase::innerStride() */ EIGEN_DEVICE_FUNC @@ -392,6 +400,20 @@ class BlockImpl_dense return m_outerStride; } + EIGEN_DEVICE_FUNC + StorageIndex startRow() const + { + std::ptrdiff_t diff = Base::data() - m_xpr.data(); + return XprType::IsRowMajor ? (diff/m_xpr.outerStride()) : (diff%m_xpr.outerStride()); + } + + EIGEN_DEVICE_FUNC + StorageIndex startCol() const + { + std::ptrdiff_t diff = Base::data() - m_xpr.data(); + return XprType::IsRowMajor ? (diff%m_xpr.outerStride()) : (diff/m_xpr.outerStride()); + } + #ifndef __SUNPRO_CC // FIXME sunstudio is not friendly with the above friend... // META-FIXME there is no 'friend' keyword around here. Is this obsolete? @@ -417,7 +439,7 @@ class BlockImpl_dense : m_xpr.innerStride(); } - typename XprType::Nested m_xpr; + XprTypeNested m_xpr; Index m_outerStride; }; diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 3a811113f..00409fb37 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -28,11 +28,11 @@ protected: public: EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType) - inline BlockImpl(const XprType& xpr, Index i) + inline BlockImpl(XprType& xpr, Index i) : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize) {} - inline BlockImpl(const XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols)) {} @@ -61,7 +61,8 @@ public: return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index : m_outerStart); } - inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; } + inline const XprType& nestedExpression() const { return m_matrix; } + inline XprType& nestedExpression() { return m_matrix; } Index startRow() const { return IsRowMajor ? m_outerStart : 0; } Index startCol() const { return IsRowMajor ? 
0 : m_outerStart; } Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); } @@ -69,7 +70,7 @@ public: protected: - typename XprType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; Index m_outerStart; const internal::variable_if_dynamic m_outerSize; @@ -263,7 +264,8 @@ public: EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); } EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); } - inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; } + inline const SparseMatrixType& nestedExpression() const { return m_matrix; } + inline SparseMatrixType& nestedExpression() { return m_matrix; } Index startRow() const { return IsRowMajor ? m_outerStart : 0; } Index startCol() const { return IsRowMajor ? 0 : m_outerStart; } Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); } @@ -419,7 +421,8 @@ public: m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); } - inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; } + inline const XprType& nestedExpression() const { return m_matrix; } + inline XprType& nestedExpression() { return m_matrix; } Index startRow() const { return m_startRow.value(); } Index startCol() const { return m_startCol.value(); } Index blockRows() const { return m_blockRows.value(); } -- cgit v1.2.3 From 46fc23f91c1c5ea21bab67976773c613bd7e4ab0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 13:44:22 -0800 Subject: Print an error message to stderr when the initialization of the CUDA runtime fails. This helps debug setup issues. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 3808eb155..c01704e56 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -34,12 +34,23 @@ static void initializeDeviceProp() { if (!m_devicePropInitialized) { int num_devices; cudaError_t status = cudaGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(status) - assert(status == cudaSuccess); + if (status != cudaSuccess) { + std::cerr << "Failed to get the number of CUDA devices: " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } m_deviceProperties = new cudaDeviceProp[num_devices]; for (int i = 0; i < num_devices; ++i) { status = cudaGetDeviceProperties(&m_deviceProperties[i], i); - assert(status == cudaSuccess); + if (status != cudaSuccess) { + std::cerr << "Failed to initialize CUDA device #" + << i + << ": " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } } m_devicePropInitialized = true; } -- cgit v1.2.3 From 6fa35bbd280733aac7b8198201050b37f958bd9c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 19 Feb 2016 22:58:52 +0100 Subject: bug #1170: skip calls to memcpy/memmove for empty input.
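Background on the bug: the C standard requires every pointer handed to memcpy/memmove to be valid even when the byte count is zero, and an empty Eigen object may legitimately hold a null data pointer. A standalone sketch of the guard pattern applied below (a hypothetical helper for illustration, not the patched Eigen code itself):

#include <cassert>
#include <cstddef>
#include <cstring>

// Copy [start, end) to target, tolerating null pointers for empty ranges.
void checked_copy(const float* start, const float* end, float* target) {
  // Byte distance computed via integer casts, mirroring the patch, so that
  // start == end == NULL never triggers pointer arithmetic on null.
  const std::ptrdiff_t size = std::ptrdiff_t(end) - std::ptrdiff_t(start);
  if (size == 0) return;  // never reach memcpy with null arguments
  assert(start != 0 && end != 0 && target != 0);
  std::memcpy(target, start, size);
}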
--- Eigen/src/Core/util/Memory.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 01513a59e..5f8bf15b2 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -486,7 +486,12 @@ template EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T* template struct smart_copy_helper { EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target) - { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); } + { + std::ptrdiff_t size = std::ptrdiff_t(end)-std::ptrdiff_t(start); + if(size==0) return; + eigen_internal_assert(start!=0 && end!=0 && target!=0); + memcpy(target, start, size); + } }; template struct smart_copy_helper { @@ -504,7 +509,12 @@ template void smart_memmove(const T* start, const T* end, T* target) template struct smart_memmove_helper { static inline void run(const T* start, const T* end, T* target) - { std::memmove(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); } + { + std::ptrdiff_t size = std::ptrdiff_t(end)-std::ptrdiff_t(start); + if(size==0) return; + eigen_internal_assert(start!=0 && end!=0 && target!=0); + std::memmove(target, start, size); + } }; template struct smart_memmove_helper { -- cgit v1.2.3 From 2af04f1a5764f083f86dda019d3fac5af12754f4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 19 Feb 2016 22:59:28 +0100 Subject: Extend unit test to stress smart_copy with empty input/output. --- test/zerosized.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zerosized.cpp b/test/zerosized.cpp index 2404fdc2b..477ff0070 100644 --- a/test/zerosized.cpp +++ b/test/zerosized.cpp @@ -38,7 +38,7 @@ template void zeroSizedMatrix() if (MatrixType::RowsAtCompileTime == Dynamic && MatrixType::ColsAtCompileTime == Dynamic) { - MatrixType t2(0, 0); + MatrixType t2(0, 0), t3(t1); VERIFY(t2.rows() == 0); VERIFY(t2.cols() == 0); -- cgit v1.2.3 From 485823b5f516e96e6b387cb4c9f5fbe5d2ce26d6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 19 Feb 2016 23:00:33 +0100 Subject: Add COD and BDCSVD in list of benched solvers. 
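For anyone meeting the two newcomers: CompleteOrthogonalDecomposition is a rank-revealing factorization that typically lands between ColPivHouseholderQR and the SVDs in cost, while BDCSVD is a divide-and-conquer SVD that scales to large matrices far better than JacobiSVD. A minimal least-squares sketch, assuming the Eigen 3.3-era API (sizes are illustrative):

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 40);
  Eigen::VectorXd b = Eigen::VectorXd::Random(100);
  // Rank-revealing solve via the complete orthogonal decomposition.
  Eigen::VectorXd x1 = A.completeOrthogonalDecomposition().solve(b);
  // Divide-and-conquer SVD; thin unitaries are enough for solving.
  Eigen::BDCSVD<Eigen::MatrixXd> svd(A, Eigen::ComputeThinU | Eigen::ComputeThinV);
  Eigen::VectorXd x2 = svd.solve(b);
}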
--- bench/dense_solvers.cpp | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp index f37a8bb5f..aa4ff011f 100644 --- a/bench/dense_solvers.cpp +++ b/bench/dense_solvers.cpp @@ -14,12 +14,12 @@ void bench(int id, int size = Size) Mat A(size,size); A.setRandom(); A = A*A.adjoint(); - BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_fpqr, t_jsvd; + BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd; int tries = 3; int rep = 1000/size; if(rep==0) rep = 1; - rep = rep*rep; +// rep = rep*rep; LLT llt(A); LDLT ldlt(A); @@ -27,8 +27,10 @@ void bench(int id, int size = Size) FullPivLU fplu(A); HouseholderQR qr(A); ColPivHouseholderQR cpqr(A); + CompleteOrthogonalDecomposition cod(A); FullPivHouseholderQR fpqr(A); JacobiSVD jsvd(A.rows(),A.cols()); + BDCSVD bdcsvd(A.rows(),A.cols()); BENCH(t_llt, tries, rep, llt.compute(A)); BENCH(t_ldlt, tries, rep, ldlt.compute(A)); @@ -36,9 +38,11 @@ void bench(int id, int size = Size) BENCH(t_fplu, tries, rep, fplu.compute(A)); BENCH(t_qr, tries, rep, qr.compute(A)); BENCH(t_cpqr, tries, rep, cpqr.compute(A)); + BENCH(t_cod, tries, rep, cod.compute(A)); BENCH(t_fpqr, tries, rep, fpqr.compute(A)); if(size<500) // JacobiSVD is really too slow for too large matrices BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV)); + BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,ComputeFullU|ComputeFullV)); results["LLT"][id] = t_llt.best(); results["LDLT"][id] = t_ldlt.best(); @@ -46,8 +50,10 @@ void bench(int id, int size = Size) results["FullPivLU"][id] = t_fplu.best(); results["HouseholderQR"][id] = t_qr.best(); results["ColPivHouseholderQR"][id] = t_cpqr.best(); + results["CompleteOrthogonalDecomposition"][id] = t_cod.best(); results["FullPivHouseholderQR"][id] = t_fpqr.best(); results["JacobiSVD"][id] = size<500 ? 
t_jsvd.best() : 0; + results["BDCSVD"][id] = t_bdcsvd.best(); } int main() { @@ -64,13 +70,15 @@ int main() IOFormat fmt(3, 0, " \t", "\n", "", ""); - std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n"; - std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n"; - std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n"; + std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n"; + std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "CompleteOrthogonalDecomposition (%) " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "BDCSVD (%) " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n"; } -- cgit v1.2.3 From d5e2ec7447e9f048d6389c4a79c0ba9dd61f0370 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 19 Feb 2016 16:29:23 -0800 Subject: Speed up tensor FFT by up to ~25-50%.
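Three changes in the diff further down account for the gains: contiguous lines are now moved with a single memcpy instead of an element-by-element loop; the n = 2, 4, 8 base cases are split into dedicated butterfly_2/4/8 helpers, with the radix-2 merge loop unrolled four ways via precomputed powers of (1 + wp); and the twiddle factors t_n = exp(sqrt(-1) * pi * n^2 / line_len) used on the non-power-of-two path are built by the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 rather than one std::pow per coefficient. The recurrence is exact since the exponents satisfy 2(n-1)^2 - (n-2)^2 + 2 = n^2. Measured results: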
Benchmark Base (ns) New (ns) Improvement ------------------------------------------------------------------ BM_tensor_fft_single_1D_cpu/8 132 134 -1.5% BM_tensor_fft_single_1D_cpu/9 1162 1229 -5.8% BM_tensor_fft_single_1D_cpu/16 199 195 +2.0% BM_tensor_fft_single_1D_cpu/17 2587 2267 +12.4% BM_tensor_fft_single_1D_cpu/32 373 341 +8.6% BM_tensor_fft_single_1D_cpu/33 5922 4879 +17.6% BM_tensor_fft_single_1D_cpu/64 797 675 +15.3% BM_tensor_fft_single_1D_cpu/65 13580 10481 +22.8% BM_tensor_fft_single_1D_cpu/128 1753 1375 +21.6% BM_tensor_fft_single_1D_cpu/129 31426 22789 +27.5% BM_tensor_fft_single_1D_cpu/256 4005 3008 +24.9% BM_tensor_fft_single_1D_cpu/257 70910 49549 +30.1% BM_tensor_fft_single_1D_cpu/512 8989 6524 +27.4% BM_tensor_fft_single_1D_cpu/513 165402 107751 +34.9% BM_tensor_fft_single_1D_cpu/999 198293 115909 +41.5% BM_tensor_fft_single_1D_cpu/1ki 21289 14143 +33.6% BM_tensor_fft_single_1D_cpu/1k 361980 233355 +35.5% BM_tensor_fft_double_1D_cpu/8 138 131 +5.1% BM_tensor_fft_double_1D_cpu/9 1253 1133 +9.6% BM_tensor_fft_double_1D_cpu/16 218 200 +8.3% BM_tensor_fft_double_1D_cpu/17 2770 2392 +13.6% BM_tensor_fft_double_1D_cpu/32 406 368 +9.4% BM_tensor_fft_double_1D_cpu/33 6418 5153 +19.7% BM_tensor_fft_double_1D_cpu/64 856 728 +15.0% BM_tensor_fft_double_1D_cpu/65 14666 11148 +24.0% BM_tensor_fft_double_1D_cpu/128 1913 1502 +21.5% BM_tensor_fft_double_1D_cpu/129 36414 24072 +33.9% BM_tensor_fft_double_1D_cpu/256 4226 3216 +23.9% BM_tensor_fft_double_1D_cpu/257 86638 52059 +39.9% BM_tensor_fft_double_1D_cpu/512 9397 6939 +26.2% BM_tensor_fft_double_1D_cpu/513 203208 114090 +43.9% BM_tensor_fft_double_1D_cpu/999 237841 125583 +47.2% BM_tensor_fft_double_1D_cpu/1ki 20921 15392 +26.4% BM_tensor_fft_double_1D_cpu/1k 455183 250763 +44.9% BM_tensor_fft_single_2D_cpu/8 1051 1005 +4.4% BM_tensor_fft_single_2D_cpu/9 16784 14837 +11.6% BM_tensor_fft_single_2D_cpu/16 4074 3772 +7.4% BM_tensor_fft_single_2D_cpu/17 75802 63884 +15.7% BM_tensor_fft_single_2D_cpu/32 20580 16931 +17.7% BM_tensor_fft_single_2D_cpu/33 345798 278579 +19.4% BM_tensor_fft_single_2D_cpu/64 97548 81237 +16.7% BM_tensor_fft_single_2D_cpu/65 1592701 1227048 +23.0% BM_tensor_fft_single_2D_cpu/128 472318 384303 +18.6% BM_tensor_fft_single_2D_cpu/129 7038351 5445308 +22.6% BM_tensor_fft_single_2D_cpu/256 2309474 1850969 +19.9% BM_tensor_fft_single_2D_cpu/257 31849182 23797538 +25.3% BM_tensor_fft_single_2D_cpu/512 10395194 8077499 +22.3% BM_tensor_fft_single_2D_cpu/513 144053843 104242541 +27.6% BM_tensor_fft_single_2D_cpu/999 279885833 208389718 +25.5% BM_tensor_fft_single_2D_cpu/1ki 45967677 36070985 +21.5% BM_tensor_fft_single_2D_cpu/1k 619727095 456489500 +26.3% BM_tensor_fft_double_2D_cpu/8 1110 1016 +8.5% BM_tensor_fft_double_2D_cpu/9 17957 15768 +12.2% BM_tensor_fft_double_2D_cpu/16 4558 4000 +12.2% BM_tensor_fft_double_2D_cpu/17 79237 66901 +15.6% BM_tensor_fft_double_2D_cpu/32 21494 17699 +17.7% BM_tensor_fft_double_2D_cpu/33 357962 290357 +18.9% BM_tensor_fft_double_2D_cpu/64 105179 87435 +16.9% BM_tensor_fft_double_2D_cpu/65 1617143 1288006 +20.4% BM_tensor_fft_double_2D_cpu/128 512848 419397 +18.2% BM_tensor_fft_double_2D_cpu/129 7271322 5636884 +22.5% BM_tensor_fft_double_2D_cpu/256 2415529 1922032 +20.4% BM_tensor_fft_double_2D_cpu/257 32517952 24462177 +24.8% BM_tensor_fft_double_2D_cpu/512 10724898 8287617 +22.7% BM_tensor_fft_double_2D_cpu/513 146007419 108603266 +25.6% BM_tensor_fft_double_2D_cpu/999 296351330 221885776 +25.1% BM_tensor_fft_double_2D_cpu/1ki 59334166 48357539 +18.5% 
BM_tensor_fft_double_2D_cpu/1k 666660132 483840349 +27.4% --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 282 +++++++++++++++---------- 1 file changed, 172 insertions(+), 110 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 9e675ad0f..97552f5bc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -219,19 +219,56 @@ struct TensorEvaluator, D ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); if (!is_power_of_two) { - ComplexScalar pos_j_base = ComplexScalar(std::cos(M_PI/line_len), std::sin(M_PI/line_len)); - for (Index j = 0; j < line_len + 1; ++j) { - pos_j_base_powered[j] = std::pow(pos_j_base, j * j); + // Compute twiddle factors + // t_n = exp(sqrt(-1) * pi * n^2 / line_len) + // for n = 0, 1,..., line_len-1. + // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 + pos_j_base_powered[0] = ComplexScalar(1, 0); + if (line_len > 1) { + const ComplexScalar pos_j_base = ComplexScalar( + std::cos(M_PI / line_len), std::sin(M_PI / line_len)); + pos_j_base_powered[1] = pos_j_base; + if (line_len > 2) { + const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + for (int i = 2; i < line_len + 1; ++i) { + pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + pos_j_base_powered[i - 1] / + pos_j_base_powered[i - 2] * pos_j_base_sq; + } + } + } + // Compute twiddle factors + // t_n = exp(sqrt(-1) * pi * n^2 / line_len) + // for n = 0, 1,..., line_len-1. + // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 + pos_j_base_powered[0] = ComplexScalar(1, 0); + if (line_len > 1) { + const ComplexScalar pos_j_base = ComplexScalar( + std::cos(M_PI / line_len), std::sin(M_PI / line_len)); + pos_j_base_powered[1] = pos_j_base; + if (line_len > 2) { + const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + for (int i = 2; i < line_len + 1; ++i) { + pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + pos_j_base_powered[i - 1] / + pos_j_base_powered[i - 2] * pos_j_base_sq; + } + } } } for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { - Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); // get data into line_buf - for (Index j = 0; j < line_len; ++j) { - Index offset = getIndexFromOffset(base_offset, dim, j); - line_buf[j] = buf[offset]; + const Index stride = m_strides[dim]; + if (stride == 1) { + memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + for (int j = 0; j < line_len; ++j, offset += stride) { + line_buf[j] = buf[offset]; + } } // processs the line @@ -243,14 +280,18 @@ struct TensorEvaluator, D } // write back - for (Index j = 0; j < line_len; ++j) { - const ComplexScalar div_factor = (FFTDir == FFT_FORWARD) ? 
ComplexScalar(1, 0) : ComplexScalar(line_len, 0); - Index offset = getIndexFromOffset(base_offset, dim, j); - buf[offset] = line_buf[j] / div_factor; + if (FFTDir == FFT_FORWARD && stride == 1) { + memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); + for (int j = 0; j < line_len; ++j, offset += stride) { + buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor; + } } } m_device.deallocate(line_buf); - if (!pos_j_base_powered) { + if (!is_power_of_two) { m_device.deallocate(a); m_device.deallocate(b); m_device.deallocate(pos_j_base_powered); @@ -372,109 +413,130 @@ struct TensorEvaluator, D } } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(ComplexScalar* data, Index n, Index n_power_of_2) { - eigen_assert(isPowerOfTwo(n)); - if (n == 1) { - return; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { + ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); } - else if (n == 2) { - ComplexScalar tmp = data[1]; - data[1] = data[0] - data[1]; - data[0] += tmp; - return; + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); } - else if (n == 4) { - ComplexScalar tmp[4]; - tmp[0] = data[0] + data[1]; - tmp[1] = data[0] - data[1]; - tmp[2] = data[2] + data[3]; - if(Dir == FFT_FORWARD) { - tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); - } - else { - tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); - } - data[0] = tmp[0] + tmp[2]; - data[1] = tmp[1] + tmp[3]; - data[2] = tmp[0] - tmp[2]; - data[3] = tmp[1] - tmp[3]; - return; + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); } - else if (n == 8) { - ComplexScalar tmp_1[8]; - ComplexScalar tmp_2[8]; - - tmp_1[0] = data[0] + data[1]; - tmp_1[1] = data[0] - data[1]; - tmp_1[2] = data[2] + data[3]; - if (Dir == FFT_FORWARD) { - tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); - } - else { - tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); - } - tmp_1[4] = data[4] + data[5]; - tmp_1[5] = data[4] - data[5]; - tmp_1[6] = data[6] + data[7]; - if (Dir == FFT_FORWARD) { - tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); - } - else { - tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); - } - tmp_2[0] = tmp_1[0] + tmp_1[2]; - tmp_2[1] = tmp_1[1] + tmp_1[3]; - tmp_2[2] = tmp_1[0] - tmp_1[2]; - tmp_2[3] = tmp_1[1] - tmp_1[3]; - tmp_2[4] = tmp_1[4] + tmp_1[6]; - // SQRT2DIV2 = 
sqrt(2)/2 - #define SQRT2DIV2 0.7071067811865476 - if (Dir == FFT_FORWARD) { - tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); - tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); - tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); - } - else { - tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); - tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); - tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); - } - data[0] = tmp_2[0] + tmp_2[4]; - data[1] = tmp_2[1] + tmp_2[5]; - data[2] = tmp_2[2] + tmp_2[6]; - data[3] = tmp_2[3] + tmp_2[7]; - data[4] = tmp_2[0] - tmp_2[4]; - data[5] = tmp_2[1] - tmp_2[5]; - data[6] = tmp_2[2] - tmp_2[6]; - data[7] = tmp_2[3] - tmp_2[7]; - - return; + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; +// SQRT2DIV2 = sqrt(2)/2 +#define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); } - else { - compute_1D_Butterfly
(data, n/2, n_power_of_2 - 1); - compute_1D_Butterfly(data + n/2, n/2, n_power_of_2 - 1); - //Original code: - //RealScalar wtemp = std::sin(M_PI/n); - //RealScalar wpi = -std::sin(2 * M_PI/n); - RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; - RealScalar wpi; - if (Dir == FFT_FORWARD) { - wpi = m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; - } - else { - wpi = 0 - m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; - } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + } - const ComplexScalar wp(wtemp, wpi); - ComplexScalar w(1.0, 0.0); - for(Index i = 0; i < n/2; i++) { - ComplexScalar temp(data[i + n/2] * w); - data[i + n/2] = data[i] - temp; - data[i] += temp; - w += w * wp; - } - return; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( + ComplexScalar* data, int n, int n_power_of_2) { + // Original code: + // RealScalar wtemp = std::sin(M_PI/n); + // RealScalar wpi = -std::sin(2 * M_PI/n); + const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + const RealScalar wpi = (Dir == FFT_FORWARD) + ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2] + : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + + const ComplexScalar wp(wtemp, wpi); + const ComplexScalar wp_one = wp + ComplexScalar(1, 0); + const ComplexScalar wp_one_2 = wp_one * wp_one; + const ComplexScalar wp_one_3 = wp_one_2 * wp_one; + const ComplexScalar wp_one_4 = wp_one_3 * wp_one; + const int n2 = n / 2; + ComplexScalar w(1.0, 0.0); + for (int i = 0; i < n2; i += 4) { + ComplexScalar temp0(data[i + n2] * w); + ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); + ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); + ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); + w = w * wp_one_4; + + data[i + n2] = data[i] - temp0; + data[i] += temp0; + + data[i + 1 + n2] = data[i + 1] - temp1; + data[i + 1] += temp1; + + data[i + 2 + n2] = data[i + 2] - temp2; + data[i + 2] += temp2; + + data[i + 3 + n2] = data[i + 3] - temp3; + data[i + 3] += temp3; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( + ComplexScalar* data, int n, int n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n > 8) { + compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); + compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); + butterfly_1D_merge(data, n, n_power_of_2); + } else if (n == 8) { + butterfly_8(data); + } else if (n == 4) { + butterfly_4(data); + } else if (n == 2) { + butterfly_2(data); } } -- cgit v1.2.3 From 8eb127022b569407a88189667c4cdb62d5442f48 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 19 Feb 2016 16:33:30 -0800 Subject: Get rid of duplicate code. --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 97552f5bc..aec5f4c8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -237,24 +237,6 @@ struct TensorEvaluator, D } } } - // Compute twiddle factors - // t_n = exp(sqrt(-1) * pi * n^2 / line_len) - // for n = 0, 1,..., line_len-1. 
- // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - pos_j_base_powered[0] = ComplexScalar(1, 0); - if (line_len > 1) { - const ComplexScalar pos_j_base = ComplexScalar( - std::cos(M_PI / line_len), std::sin(M_PI / line_len)); - pos_j_base_powered[1] = pos_j_base; - if (line_len > 2) { - const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int i = 2; i < line_len + 1; ++i) { - pos_j_base_powered[i] = pos_j_base_powered[i - 1] * - pos_j_base_powered[i - 1] / - pos_j_base_powered[i - 2] * pos_j_base_sq; - } - } - } } for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { -- cgit v1.2.3 From 1e6fe6f046152cc7ed5fb7d9aad3f42f3217eb5b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 20 Feb 2016 07:44:17 +0000 Subject: Fixed the float16 tensor test. --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 5ce96a1c2..7449d6f8c 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -83,8 +83,10 @@ void test_cuda_elementwise() { Tensor full_prec(num_elem); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise " << i << std::endl; VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } @@ -93,12 +95,13 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } + /* void test_cuda_contractions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); - int rows = 101; - int cols = 101; + int rows = 23; + int cols = 23; int num_elem = rows*cols; float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); @@ -115,8 +118,8 @@ void test_cuda_contractions() { Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, rows, cols); - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float1.constant(0.5f); typedef Tensor::DimensionPair DimPair; Eigen::array dims(DimPair(1, 0)); @@ -127,9 +130,11 @@ void test_cuda_contractions() { Tensor full_prec(rows, cols); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); for (int i = 0; i < rows; ++i) { for (int j = 0; j < cols; ++j) { + std::cout << "Checking contract " << i << " " << j << std::endl; VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); } } @@ -144,7 +149,7 @@ void test_cuda_contractions() { void test_cuda_reductions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); - int size = 101; + int size = 13; int num_elem = size*size; float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); @@ -170,10 +175,12 @@ void test_cuda_reductions() { Tensor half_prec(size); Tensor full_prec(size); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + 
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(float)); + gpu_device.synchronize(); for (int i = 0; i < size; ++i) { + std::cout << "Checking redux " << i << std::endl; VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } -- cgit v1.2.3 From 9ff269a1d3ab3ff39df6b8f9444a3ec672b32649 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 20 Feb 2016 07:47:23 +0000 Subject: Moved some of the fp16 operators outside the Eigen namespace to work around some nvcc limitations. --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index d0106f4f1..7af0bdc60 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -10,10 +10,6 @@ #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H #define EIGEN_PACKET_MATH_HALF_CUDA_H -namespace Eigen { - -namespace internal { - #if defined(EIGEN_HAS_CUDA_FP16) // Make sure this is only available when targeting a GPU: we don't want to @@ -41,22 +37,28 @@ __device__ half operator - (const half& a) { return __hneg(a); } __device__ half operator += (half& a, const half& b) { - a = __hadd(a, b); + a = a + b; return a; } __device__ half operator *= (half& a, const half& b) { - a = __hmul(a, b); + a = a * b; return a; } __device__ half operator -= (half& a, const half& b) { - a = __hsub(a, b); + a = a - b; return a; } __device__ half operator /= (half& a, const half& b) { a = a / b; return a; } +__device__ half __shfl_xor(half a, int) { + assert(false && "tbd"); + return a; +} +namespace Eigen { +namespace internal { template<> struct is_arithmetic { enum { value = true }; }; @@ -225,13 +227,11 @@ ptranspose(PacketBlock& kernel) { // kernel.packet[1].x = tmp; } -#endif -#endif -#endif - } // end namespace internal } // end namespace Eigen - +#endif +#endif +#endif #endif // EIGEN_PACKET_MATH_HALF_CUDA_H -- cgit v1.2.3 From 203490017f53505176be8d88206a04a2f4ce7be8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 21 Feb 2016 08:49:36 -0800 Subject: Prevent unnecessary Index to int conversions --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index aec5f4c8e..a5aa05da4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -230,10 +230,10 @@ struct TensorEvaluator, D pos_j_base_powered[1] = pos_j_base; if (line_len > 2) { const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int i = 2; i < line_len + 1; ++i) { - pos_j_base_powered[i] = pos_j_base_powered[i - 1] * - pos_j_base_powered[i - 1] / - pos_j_base_powered[i - 2] * pos_j_base_sq; + for (int j = 2; j < line_len + 1; ++j) { + pos_j_base_powered[j] = pos_j_base_powered[j - 1] * + pos_j_base_powered[j - 1] / + pos_j_base_powered[j - 2] * pos_j_base_sq; } } } @@ -468,7 +468,7 @@ struct TensorEvaluator, D template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( - ComplexScalar* data, int n, int n_power_of_2) { + ComplexScalar* data, Index n, Index n_power_of_2) { // Original code: // RealScalar wtemp = std::sin(M_PI/n); // RealScalar wpi = -std::sin(2 * M_PI/n); const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; const RealScalar wpi = (Dir == FFT_FORWARD) ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2] : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; const ComplexScalar wp(wtemp, wpi); const ComplexScalar wp_one = wp + ComplexScalar(1, 0); const ComplexScalar wp_one_2 = wp_one *
wp_one; const ComplexScalar wp_one_3 = wp_one_2 * wp_one; const ComplexScalar wp_one_4 = wp_one_3 * wp_one; - const int n2 = n / 2; + const Index n2 = n / 2; ComplexScalar w(1.0, 0.0); - for (int i = 0; i < n2; i += 4) { + for (Index i = 0; i < n2; i += 4) { ComplexScalar temp0(data[i + n2] * w); ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); @@ -507,7 +507,7 @@ struct TensorEvaluator, D template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( - ComplexScalar* data, int n, int n_power_of_2) { + ComplexScalar* data, Index n, Index n_power_of_2) { eigen_assert(isPowerOfTwo(n)); if (n > 8) { compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); -- cgit v1.2.3 From 96a24b05cc836072ce0fd2b50c4e94ea652bd1aa Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 21 Feb 2016 11:16:15 -0800 Subject: Optimized casting of tensors in the case where the casting happens to be a no-op --- unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index e254c0b7b..21bb91d69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -195,8 +195,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + if (internal::is_same::value) { + return m_impl.evalSubExprsIfNeeded((SrcType*)data); + } m_impl.evalSubExprsIfNeeded(NULL); return true; } -- cgit v1.2.3 From ed69cbeef00eceb15c77832967d7586bd20d6ef4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 21 Feb 2016 11:20:20 -0800 Subject: Added some debugging information to the test to figure out why it fails sometimes --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 7449d6f8c..98f5ad83d 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -201,6 +201,8 @@ void test_cxx11_tensor_of_float16_cuda() Eigen::GpuDevice device(&stream); if (device.majorDeviceVersion() > 5 || (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { + std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." 
<< device.minorDeviceVersion() << std::endl; + CALL_SUBTEST_1(test_cuda_conversion()); CALL_SUBTEST_1(test_cuda_elementwise()); // CALL_SUBTEST_2(test_cuda_contractions()); -- cgit v1.2.3 From 95fceb6452cb3337bb46a637a8244dd9709b2621 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 21 Feb 2016 20:24:11 +0000 Subject: Added the ability to compute the absolute value of a half float --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 15 ++++++--- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 39 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 7af0bdc60..4a10e4fa5 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -52,9 +52,13 @@ __device__ half operator /= (half& a, const half& b) { a = a / b; return a; } -__device__ half __shfl_xor(half a, int) { - assert(false && "tbd"); - return a; + +namespace std { +__device__ half abs(const half& a) { + half result; + result.x = a.x & 0x7FFF; + return result; +} } namespace Eigen { @@ -214,8 +218,9 @@ template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) { } template<> EIGEN_DEVICE_FUNC inline half2 pabs(const half2& a) { - assert(false && "tbd"); - return half2(); + half2 result; + result.x = a.x & 0x7FFF7FFF; + return result; } diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 7449d6f8c..ff045db7f 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -55,6 +55,44 @@ void test_cuda_conversion() { gpu_device.deallocate(d_conv); } + +void test_cuda_unary() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random(); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half.device(gpu_device) = gpu_float.cast().abs().cast(); + + Tensor half_prec(num_elem); + Tensor full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking unary " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + void test_cuda_elementwise() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -202,6 +240,7 @@ void test_cxx11_tensor_of_float16_cuda() if (device.majorDeviceVersion() > 5 || (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_unary()); CALL_SUBTEST_1(test_cuda_elementwise()); // CALL_SUBTEST_2(test_cuda_contractions()); CALL_SUBTEST_3(test_cuda_reductions()); -- cgit v1.2.3 From 584832cb3c46126697844d0afb9ef56b8da9f049 
Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 21 Feb 2016 12:44:53 -0800 Subject: Implemented the ptranspose function on half floats --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 4a10e4fa5..bd42cb558 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -226,10 +226,12 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - assert(false && "tbd"); - // half tmp = kernel.packet[0].y; - // kernel.packet[0].y = kernel.packet[1].x; - // kernel.packet[1].x = tmp; + half a1 = __low2half(kernel.packet[0]); + half a2 = __high2half(kernel.packet[0]); + half b1 = __low2half(kernel.packet[1]); + half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); } } // end namespace internal -- cgit v1.2.3 From 257b640463401ce042ffd223083d443b9a3690fa Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 21 Feb 2016 22:43:37 -0800 Subject: Fixed compilation warning generated by clang --- unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 21bb91d69..4e87813a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -165,6 +165,18 @@ class TensorConversionOp : public TensorBase struct ConversionSubExprEval { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) { + impl.evalSubExprsIfNeeded(NULL); + return true; + } +}; + +template struct ConversionSubExprEval { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) { + return impl.evalSubExprsIfNeeded(data); + } +}; @@ -197,11 +209,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - if (internal::is_same::value) { - return m_impl.evalSubExprsIfNeeded((SrcType*)data); - } - m_impl.evalSubExprsIfNeeded(NULL); - return true; + return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() -- cgit v1.2.3 From 5cd00068c0830a55414cd91ea621a547d0c5097f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 22 Feb 2016 13:59:03 -0800 Subject: include <iostream> in the tensor header since we now use it to better report cuda initialization errors --- unsupported/Eigen/CXX11/Tensor | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index b4f860c41..3b5be4426 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -58,6 +58,7 @@ typedef unsigned __int64 uint64_t; #endif #ifdef EIGEN_USE_GPU +#include <iostream> #include #if defined(__CUDACC__) #include -- cgit v1.2.3 From 6270d851e3082b272d4a2cd723ac800fa954224e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 22 Feb 2016 13:59:33 -0800 Subject: Declare the half float type as arithmetic.
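Context: internal::is_arithmetic is how Eigen's meta-programming decides that a type behaves like a plain scalar, e.g. that it is cheap to copy and may be passed by value. The one-line specialization below amounts to the following, reproduced in simplified standalone form (illustrative only, not Eigen's exact definitions):

// Stand-in for CUDA's 16-bit float so this sketch is self-contained.
struct half { unsigned short x; };

// Primary template: be conservative about unknown types.
template <typename T> struct is_arithmetic { enum { value = 0 }; };

// The patch adds the analogue of this specialization for half, so the packet
// and functor traits treat it like a built-in scalar such as float or int.
template <> struct is_arithmetic<half> { enum { value = 1 }; };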
--- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index bd42cb558..1a1b4ec3d 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -64,6 +64,7 @@ __device__ half abs(const half& a) { namespace Eigen { namespace internal { +template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct packet_traits : default_packet_traits -- cgit v1.2.3 From 72d2cf642e050b1c604d57f6c7750a53d5438942 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 22 Feb 2016 15:29:41 -0800 Subject: Deleted the coordinate based evaluation of tensor expressions, since it's hardly ever used and started to cause some issues with some versions of xcode. --- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 55 +--------------- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 22 +------ unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 52 +-------------- .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 75 +--------------------- 4 files changed, 5 insertions(+), 199 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2ab332add..bc6021c9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -167,7 +167,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, - CoordAccess = NumDims == 5, + CoordAccess = false, RawAccess = false }; @@ -437,59 +437,6 @@ struct TensorEvaluator, Device> Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const - { - // Location of the first element of the patch. - // ColMajor - // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches - // RowMajor - // 0: number of batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: d - const Index patch2DIndex = coords[static_cast(Layout) == static_cast(ColMajor) ? 3 : 1]; - - array inputCoords; - Index input_col_idx = patch2DIndex / m_fastInputColsEff; - Index inputCol = input_col_idx + coords[1] * m_in_row_strides - m_rowPaddingTop; - Index inputRow = patch2DIndex - input_col_idx * m_input_cols_eff + coords[2] * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); - if (static_cast(Layout) == static_cast(ColMajor)) { - inputCoords[0] = coords[0]; // depth - inputCoords[1] = origInputCol; - inputCoords[2] = origInputRow; - inputCoords[3] = coords[4]; // batch - } else { - inputCoords[3] = coords[4]; // depth - inputCoords[2] = origInputCol; - inputCoords[1] = origInputRow; - inputCoords[0] = coords[0]; // batch - } - // If the computed coordinates are outside the original image perimeter, return 0. 
- if (inputCol < 0 || inputCol >= m_input_cols_eff || inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides)) || - ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { - return Scalar(m_paddingValue); - } - if (TensorEvaluator::CoordAccess) { - return m_impl.coeff(inputCoords); - } else { - Index inputIndex; - if (static_cast(Layout) == static_cast(ColMajor)) { - inputIndex = - inputCoords[3] * m_patchInputStride + - inputCoords[2] * m_colInputStride + - inputCoords[1] * m_rowInputStride + - inputCoords[0]; - } else { - inputIndex = - inputCoords[1] * m_patchInputStride + - inputCoords[2] * m_colInputStride + - inputCoords[3] * m_rowInputStride + - inputCoords[4]; - } - return m_impl.coeff(inputIndex); - } - } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 11284315c..e867e450e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -318,7 +318,7 @@ struct TensorEvaluator, Devi IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, - CoordAccess = TensorEvaluator::CoordAccess, + CoordAccess = false, RawAccess = false }; @@ -457,15 +457,6 @@ struct TensorEvaluator, Devi } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) - { - array inputCoords; - for (int i = 0; i < NumDims; ++i) { - inputCoords = coords[i] + this->m_offsets[i]; - } - return m_impl.coeff(inputCoords); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); if (result) { @@ -547,7 +538,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, - CoordAccess = TensorEvaluator::CoordAccess, + CoordAccess = false, RawAccess = false }; @@ -608,15 +599,6 @@ struct TensorEvaluator, Device> } } } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array& coords) - { - array inputCoords; - for (int i = 0; i < NumDims; ++i) { - inputCoords = coords[i] + this->m_offsets[i]; - } - return this->m_impl.coeffRef(inputCoords); - } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 2cbb820b1..57b716fd6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -93,7 +93,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, - CoordAccess = true, + CoordAccess = false, RawAccess = false }; @@ -248,56 +248,6 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const - { - Index patch_coord_idx = Layout == ColMajor ? NumDims - 1 : 0; - // Location of the first element of the patch. 
- const Index patchIndex = coords[patch_coord_idx]; - - if (TensorEvaluator::CoordAccess) { - array inputCoords; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = coords[i]; - inputCoords[i] = coords[i] + patchIdx; - } - } else { - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = coords[i+1]; - inputCoords[i] = coords[i+1] + patchIdx; - } - } - Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1; - inputCoords[0] = (patchIndex + coords[coords_idx]); - return m_impl.coeff(inputCoords); - } - else { - Index inputIndex = 0; - if (Layout == ColMajor) { - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = coords[i]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } else { - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = coords[i+1]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } - Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1; - inputIndex += (patchIndex + coords[coords_idx]); - return m_impl.coeff(inputIndex); - } - } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 52b78b261..04f4f8ffc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -180,7 +180,7 @@ struct TensorEvaluator, D PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, Layout = TensorEvaluator::Layout, - CoordAccess = NumDims == 6, + CoordAccess = false, RawAccess = false }; @@ -518,79 +518,6 @@ struct TensorEvaluator, D Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const - { - // ColMajor - // 0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches - // RowMajor - // 0: batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: patch_planes, 5: depth - const Index patch3DIndex = coords[static_cast(Layout) == static_cast(ColMajor) ? 4 : 1]; - const Index colOffset = coords[static_cast(Layout) == static_cast(ColMajor) ? 3 : 2]; - const Index rowOffset= coords[static_cast(Layout) == static_cast(ColMajor) ? 2 : 3]; - const Index planeOffset = coords[static_cast(Layout) == static_cast(ColMajor) ? 1 : 4]; - - array inputCoords; - - const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; - const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? 
(inputCol / m_fastInputColStride) : 0); - if (inputCol < 0 || inputCol >= m_input_cols_eff || - ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { - return Scalar(m_paddingValue); - } - - const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; - const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); - if (inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { - return Scalar(m_paddingValue); - } - - const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows; - const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; - const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); - if (inputPlane < 0 || inputPlane >= m_input_planes_eff || - ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { - return Scalar(m_paddingValue); - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - inputCoords[0] = coords[0]; // depth - inputCoords[1] = origInputPlane; - inputCoords[2] = origInputRow; - inputCoords[3] = origInputCol; - inputCoords[4] = coords[5]; // batch - } else { - inputCoords[4] = coords[5]; // depth - inputCoords[3] = origInputPlane; - inputCoords[2] = origInputRow; - inputCoords[1] = origInputCol; - inputCoords[0] = coords[0]; // batch - } - if (TensorEvaluator::CoordAccess) { - return m_impl.coeff(inputCoords); - } else { - Index inputIndex; - if (static_cast(Layout) == static_cast(ColMajor)) { - inputIndex = - inputCoords[4] * m_otherInputStride + - inputCoords[3] * m_colInputStride + - inputCoords[2] * m_rowInputStride + - inputCoords[1] * m_planeInputStride + - inputCoords[0]; - } else { - inputIndex = - inputCoords[0] * m_otherInputStride + - inputCoords[1] * m_colInputStride + - inputCoords[2] * m_rowInputStride + - inputCoords[3] * m_planeInputStride + - inputCoords[4]; - } - return m_impl.coeff(inputIndex); - } - } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { -- cgit v1.2.3 From f442a5a5b34ede4ab4e8fe36d1c8237315ad3f04 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 23 Feb 2016 04:15:48 +0000 Subject: Updated the tensor benchmarking code to work with compilers that don't support cxx11. 
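The recurring pattern in this patch swaps C++11 aggregate initialization of Eigen::array for element-wise assignment. A minimal before/after sketch, assuming the arrays are Eigen::array<TensorIndex, 2>, consistent with the two dimensions m_ and k_ used below:

// C++11 only: brace (aggregate) initialization.
// const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};

// Portable form that also compiles without C++11 support:
Eigen::array<TensorIndex, 2> sizes;
sizes[0] = m_;
sizes[1] = k_;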
--- bench/tensors/tensor_benchmarks.h | 128 ++++++++++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 34 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index f3ec70a9e..688f558d0 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -45,7 +45,9 @@ template class BenchmarkSuite { void typeCasting(int num_iters) { eigen_assert(m_ == n_); - const Eigen::array sizes = {{m_, k_}}; + Eigen::array sizes; + sizes[0] = m_; + sizes[1] = k_; const TensorMap, Eigen::Aligned> A(a_, sizes); TensorMap, Eigen::Aligned> B((int*)b_, sizes); @@ -59,7 +61,9 @@ template class BenchmarkSuite { void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes = {{m_, m_}}; + Eigen::array sizes; + sizes[0] = m_; + sizes[1] = m_; TensorMap, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); @@ -72,7 +76,9 @@ template class BenchmarkSuite { void slicing(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes = {{m_, m_}}; + Eigen::array sizes; + sizes[0] = m_; + sizes[1] = m_; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -100,9 +106,12 @@ template class BenchmarkSuite { } void rowChip(int num_iters) { - const Eigen::array input_size = {{k_, n_}}; + Eigen::array input_size; + input_size[0] = k_; + input_size[1] = n_; const TensorMap, Eigen::Aligned> B(b_, input_size); - const Eigen::array output_size = {{n_}}; + Eigen::array output_size; + output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); StartBenchmarkTiming(); @@ -114,9 +123,12 @@ template class BenchmarkSuite { } void colChip(int num_iters) { - const Eigen::array input_size= {{k_, n_}}; + Eigen::array input_size; + input_size[0] = k_; + input_size[1] = n_; const TensorMap, Eigen::Aligned> B(b_, input_size); - const Eigen::array output_size = {{n_}}; + Eigen::array output_size; + output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); StartBenchmarkTiming(); @@ -129,12 +141,18 @@ template class BenchmarkSuite { void shuffling(int num_iters) { eigen_assert(m_ == n_); - const Eigen::array size_a = {{m_, k_}}; + Eigen::array size_a; + size_a[0] = m_; + size_a[1] = k_; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b = {{k_, m_}}; + Eigen::array size_b; + size_b[0] = k_; + size_b[1] = m_; TensorMap, Eigen::Aligned> B(b_, size_b); - const Eigen::array shuffle = {{1, 0}}; + Eigen::array shuffle; + shuffle[0] = 1; + shuffle[1] = 0; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -146,9 +164,13 @@ template class BenchmarkSuite { void padding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array size_a = {{m_, k_-3}}; + Eigen::array size_a; + size_a[0] = m_; + size_a[1] = k_-3; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b = {{k_, m_}}; + Eigen::array size_b; + size_b[0] = k_; + size_b[1] = m_; TensorMap, Eigen::Aligned> B(b_, size_b); Eigen::array, 2> paddings; @@ -165,12 +187,18 @@ template class BenchmarkSuite { void striding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array size_a = {{m_, k_}}; + Eigen::array size_a; + size_a[0] = m_; + size_a[1] = k_; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b = {{m_, k_ / 2}}; + Eigen::array size_b; + size_b[0] = m_; + size_b[1] = k_/2; TensorMap, Eigen::Aligned> B(b_, size_b); - const Eigen::array strides = {{1, 2}}; + 
Eigen::array strides; + strides[0] = 1; + strides[1] = 2; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -181,13 +209,19 @@ template class BenchmarkSuite { } void broadcasting(int num_iters) { - const Eigen::array size_a = {{m_, 1}}; + Eigen::array size_a; + size_a[0] = m_; + size_a[1] = 1; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_c = {{m_, n_}}; + Eigen::array size_c; + size_c[0] = m_; + size_c[1] = n_; TensorMap, Eigen::Aligned> C(c_, size_c); #ifndef EIGEN_HAS_INDEX_LIST - const Eigen::array broadcast = {{1, n_}}; + Eigen::array broadcast; + broadcast[0] = 1; + broadcast[1] = n_; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -205,7 +239,9 @@ template class BenchmarkSuite { void coeffWiseOp(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes = {{m_, m_}}; + Eigen::array sizes; + sizes[0] = m_; + sizes[1] = m_; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -221,7 +257,9 @@ template class BenchmarkSuite { void algebraicFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes = {{m_, m_}}; + Eigen::array sizes; + sizes[0] = m_; + sizes[1] = m_; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -237,7 +275,9 @@ template class BenchmarkSuite { void transcendentalFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes = {{m_, m_}}; + Eigen::array sizes; + sizes[0] = m_; + sizes[1] = m_; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -253,13 +293,16 @@ template class BenchmarkSuite { // Row reduction void rowReduction(int num_iters) { - const Eigen::array input_size = {{k_, n_}}; + Eigen::array input_size; + input_size[0] = k_; + input_size[1] = n_; const TensorMap, Eigen::Aligned> B(b_, input_size); const Eigen::array output_size = {{n_}}; TensorMap, Eigen::Aligned> C(c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST - const Eigen::array sum_along_dim = {{0}}; + Eigen::array sum_along_dim; + sum_along_dim[0] = 0; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -277,7 +320,9 @@ template class BenchmarkSuite { // Column reduction void colReduction(int num_iters) { - const Eigen::array input_size = {{k_, n_}}; + Eigen::array input_size; + input_size[0] = k_; + input_size[1] = n_; const TensorMap, Eigen::Aligned> B( b_, input_size); const Eigen::array output_size = {{k_}}; @@ -285,7 +330,8 @@ template class BenchmarkSuite { c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST - const Eigen::array sum_along_dim = {{1}}; + Eigen::array sum_along_dim; + sum_along_dim = 1; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. 
@@ -303,16 +349,23 @@ template class BenchmarkSuite { // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - const Eigen::array sizeA = {{m_, k_}}; - const Eigen::array sizeB = {{k_, n_}}; - const Eigen::array sizeC = {{m_, n_}}; + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; - const Eigen::array dims = {{DimPair(1, 0)}}; + Eigen::array dims; + dims[0] = DimPair(1, 0); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -324,14 +377,21 @@ template class BenchmarkSuite { } void convolution(int num_iters, int kernel_x, int kernel_y) { - const Eigen::array input_sizes = {{m_, n_}}; + Eigen::array input_sizes; + input_sizes[0] = m_; + input_sizes[1] = n_; TensorMap, Eigen::Aligned> A(a_, input_sizes); - const Eigen::array kernel_sizes = {{kernel_x, kernel_y}}; + Eigen::array kernel_sizes; + kernel_sizes[0] = kernel_x; + kernel_sizes[1] = kernel_y; TensorMap, Eigen::Aligned> B(b_, kernel_sizes); - const Eigen::array result_sizes = - {{m_ - kernel_x + 1, n_ - kernel_y + 1}}; + Eigen::array result_sizes; + result_sizes[0] = m_ - kernel_x + 1; + result_sizes[1] = n_ - kernel_y + 1; TensorMap, Eigen::Aligned> C(c_, result_sizes); - Eigen::array::Index, 2> dims = {{0, 1}}; + Eigen::array::Index, 2> dims; + dims[0] = 0; + dims[1] = 1; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { -- cgit v1.2.3 From 8cb9bfab870c1f55ea9c69233a832e92c8de189d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 23 Feb 2016 05:28:02 +0000 Subject: Extended the tensor benchmark suite to support types other than floats --- bench/tensors/tensor_benchmarks.h | 100 ++++++++++++++++----------------- bench/tensors/tensor_benchmarks_cpu.cc | 42 +++++++------- bench/tensors/tensor_benchmarks_gpu.cu | 6 +- 3 files changed, 74 insertions(+), 74 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 688f558d0..b208a401a 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -15,7 +15,7 @@ using Eigen::TensorMap; // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. 
-template class BenchmarkSuite { +template class BenchmarkSuite { public: BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n) : m_(m), k_(k), n_(n), device_(device) { @@ -37,7 +37,7 @@ template class BenchmarkSuite { eigen_assert(m_ == k_ && k_ == n_); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); + device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); } // Record the number of values copied per second finalizeBenchmark(static_cast(m_) * m_ * num_iters); @@ -48,12 +48,12 @@ template class BenchmarkSuite { Eigen::array sizes; sizes[0] = m_; sizes[1] = k_; - const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> A(a_, sizes); TensorMap, Eigen::Aligned> B((int*)b_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - B.device(device_) = A.cast(); + B.device(device_) = A.template cast(); } // Record the number of values copied per second finalizeBenchmark(static_cast(m_) * k_ * num_iters); @@ -64,7 +64,7 @@ template class BenchmarkSuite { Eigen::array sizes; sizes[0] = m_; sizes[1] = m_; - TensorMap, Eigen::Aligned> C(c_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -79,9 +79,9 @@ template class BenchmarkSuite { Eigen::array sizes; sizes[0] = m_; sizes[1] = m_; - const TensorMap, Eigen::Aligned> A(a_, sizes); - const TensorMap, Eigen::Aligned> B(b_, sizes); - TensorMap, Eigen::Aligned> C(c_, sizes); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); const Eigen::DSizes quarter_sizes(m_/2, m_/2); const Eigen::DSizes first_quadrant(0, 0); @@ -109,10 +109,10 @@ template class BenchmarkSuite { Eigen::array input_size; input_size[0] = k_; input_size[1] = n_; - const TensorMap, Eigen::Aligned> B(b_, input_size); + const TensorMap, Eigen::Aligned> B(b_, input_size); Eigen::array output_size; output_size[0] = n_; - TensorMap, Eigen::Aligned> C(c_, output_size); + TensorMap, Eigen::Aligned> C(c_, output_size); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -126,10 +126,10 @@ template class BenchmarkSuite { Eigen::array input_size; input_size[0] = k_; input_size[1] = n_; - const TensorMap, Eigen::Aligned> B(b_, input_size); + const TensorMap, Eigen::Aligned> B(b_, input_size); Eigen::array output_size; output_size[0] = n_; - TensorMap, Eigen::Aligned> C(c_, output_size); + TensorMap, Eigen::Aligned> C(c_, output_size); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -144,11 +144,11 @@ template class BenchmarkSuite { Eigen::array size_a; size_a[0] = m_; size_a[1] = k_; - const TensorMap, Eigen::Aligned> A(a_, size_a); + const TensorMap, Eigen::Aligned> A(a_, size_a); Eigen::array size_b; size_b[0] = k_; size_b[1] = m_; - TensorMap, Eigen::Aligned> B(b_, size_b); + TensorMap, Eigen::Aligned> B(b_, size_b); Eigen::array shuffle; shuffle[0] = 1; @@ -167,11 +167,11 @@ template class BenchmarkSuite { Eigen::array size_a; size_a[0] = m_; size_a[1] = k_-3; - const TensorMap, Eigen::Aligned> A(a_, size_a); + const TensorMap, Eigen::Aligned> A(a_, size_a); Eigen::array size_b; size_b[0] = k_; size_b[1] = m_; - TensorMap, Eigen::Aligned> B(b_, size_b); + TensorMap, Eigen::Aligned> B(b_, size_b); Eigen::array, 2> paddings; paddings[0] = Eigen::IndexPair(0, 0); @@ -190,11 +190,11 @@ template class BenchmarkSuite { Eigen::array size_a; size_a[0] = m_; size_a[1] = 
k_; - const TensorMap, Eigen::Aligned> A(a_, size_a); + const TensorMap, Eigen::Aligned> A(a_, size_a); Eigen::array size_b; size_b[0] = m_; size_b[1] = k_/2; - TensorMap, Eigen::Aligned> B(b_, size_b); + TensorMap, Eigen::Aligned> B(b_, size_b); Eigen::array strides; strides[0] = 1; @@ -212,11 +212,11 @@ template class BenchmarkSuite { Eigen::array size_a; size_a[0] = m_; size_a[1] = 1; - const TensorMap, Eigen::Aligned> A(a_, size_a); + const TensorMap, Eigen::Aligned> A(a_, size_a); Eigen::array size_c; size_c[0] = m_; size_c[1] = n_; - TensorMap, Eigen::Aligned> C(c_, size_c); + TensorMap, Eigen::Aligned> C(c_, size_c); #ifndef EIGEN_HAS_INDEX_LIST Eigen::array broadcast; @@ -242,9 +242,9 @@ template class BenchmarkSuite { Eigen::array sizes; sizes[0] = m_; sizes[1] = m_; - const TensorMap, Eigen::Aligned> A(a_, sizes); - const TensorMap, Eigen::Aligned> B(b_, sizes); - TensorMap, Eigen::Aligned> C(c_, sizes); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -260,9 +260,9 @@ template class BenchmarkSuite { Eigen::array sizes; sizes[0] = m_; sizes[1] = m_; - const TensorMap, Eigen::Aligned> A(a_, sizes); - const TensorMap, Eigen::Aligned> B(b_, sizes); - TensorMap, Eigen::Aligned> C(c_, sizes); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -278,9 +278,9 @@ template class BenchmarkSuite { Eigen::array sizes; sizes[0] = m_; sizes[1] = m_; - const TensorMap, Eigen::Aligned> A(a_, sizes); - const TensorMap, Eigen::Aligned> B(b_, sizes); - TensorMap, Eigen::Aligned> C(c_, sizes); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -296,9 +296,9 @@ template class BenchmarkSuite { Eigen::array input_size; input_size[0] = k_; input_size[1] = n_; - const TensorMap, Eigen::Aligned> B(b_, input_size); + const TensorMap, Eigen::Aligned> B(b_, input_size); const Eigen::array output_size = {{n_}}; - TensorMap, Eigen::Aligned> C(c_, output_size); + TensorMap, Eigen::Aligned> C(c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST Eigen::array sum_along_dim; @@ -323,10 +323,10 @@ template class BenchmarkSuite { Eigen::array input_size; input_size[0] = k_; input_size[1] = n_; - const TensorMap, Eigen::Aligned> B( + const TensorMap, Eigen::Aligned> B( b_, input_size); const Eigen::array output_size = {{k_}}; - TensorMap, Eigen::Aligned> C( + TensorMap, Eigen::Aligned> C( c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST @@ -359,11 +359,11 @@ template class BenchmarkSuite { sizeC[0] = m_; sizeC[1] = n_; - const TensorMap, Eigen::Aligned> A(a_, sizeA); - const TensorMap, Eigen::Aligned> B(b_, sizeB); - TensorMap, Eigen::Aligned> C(c_, sizeC); + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); - typedef typename Tensor::DimensionPair DimPair; + typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(1, 0); @@ -380,16 +380,16 @@ template class BenchmarkSuite { Eigen::array input_sizes; input_sizes[0] = m_; input_sizes[1] = n_; - TensorMap, Eigen::Aligned> A(a_, input_sizes); + TensorMap, Eigen::Aligned> A(a_, 
input_sizes); Eigen::array kernel_sizes; kernel_sizes[0] = kernel_x; kernel_sizes[1] = kernel_y; - TensorMap, Eigen::Aligned> B(b_, kernel_sizes); + TensorMap, Eigen::Aligned> B(b_, kernel_sizes); Eigen::array result_sizes; result_sizes[0] = m_ - kernel_x + 1; result_sizes[1] = n_ - kernel_y + 1; - TensorMap, Eigen::Aligned> C(c_, result_sizes); - Eigen::array::Index, 2> dims; + TensorMap, Eigen::Aligned> C(c_, result_sizes); + Eigen::array dims; dims[0] = 0; dims[1] = 1; @@ -405,15 +405,15 @@ template class BenchmarkSuite { private: void initialize() { - a_ = (float *) device_.allocate(m_ * k_ * sizeof(float)); - b_ = (float *) device_.allocate(k_ * n_ * sizeof(float)); - c_ = (float *) device_.allocate(m_ * n_ * sizeof(float)); + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); // Initialize the content of the memory pools to prevent asan from // complaining. - device_.memset(a_, 12, m_ * k_ * sizeof(float)); - device_.memset(b_, 23, k_ * n_ * sizeof(float)); - device_.memset(c_, 31, m_ * n_ * sizeof(float)); + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); //BenchmarkUseRealTime(); } @@ -432,9 +432,9 @@ template class BenchmarkSuite { TensorIndex m_; TensorIndex k_; TensorIndex n_; - float* a_; - float* b_; - float* c_; + T* a_; + T* b_; + T* c_; Device device_; }; #endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 6754e1a32..8947f4b7f 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -9,13 +9,13 @@ Eigen::ThreadPool pool(threads); \ Eigen::ThreadPoolDevice device(&pool, threads); // Simple functions -#define BM_FuncCPU(FUNC, THREADS) \ - static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ - StopBenchmarkTiming(); \ - CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters); \ - } \ +#define BM_FuncCPU(FUNC, THREADS) \ + static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); BM_FuncCPU(memcpy, 4); @@ -80,19 +80,19 @@ BM_FuncCPU(colReduction, 12); // Contractions -#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ - static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\ - StopBenchmarkTiming(); \ - if (THREADS == 1) { \ - Eigen::DefaultDevice device; \ - BenchmarkSuite suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } else { \ - CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } \ - } \ +#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ + static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + if (THREADS == 1) { \ + Eigen::DefaultDevice device; \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } else { \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); @@ -138,7 +138,7 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ 
StopBenchmarkTiming(); \ CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite suite(device, N); \ + BenchmarkSuite suite(device, N); \ suite.FUNC(iters, DIM1, DIM2); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index 611e8197b..a6f594382 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -12,7 +12,7 @@ StopBenchmarkTiming(); \ Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite suite(device, N); \ + BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ } \ @@ -41,7 +41,7 @@ BM_FuncGPU(colReduction); StopBenchmarkTiming(); \ Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite suite(device, D1, D2, D3); \ + BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ } \ @@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64); StopBenchmarkTiming(); \ Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite suite(device, N); \ + BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters, DIM1, DIM2); \ } \ -- cgit v1.2.3 From 1d9256f7db5db6c9f7fa915b4af868625f53502f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 23 Feb 2016 05:51:22 +0000 Subject: Updated the padding code to work with half floats --- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 32 +++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 39a305a93..c3f25f0df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -151,27 +151,27 @@ struct TensorEvaluator, Device for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return Scalar(0); + return internal::scalar_cast_op()(0); } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { - return Scalar(0); + return internal::scalar_cast_op()(0); } inputIndex += (index - m_padding[0].first); } else { for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return Scalar(0); + return internal::scalar_cast_op()(0); } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i+1]; } if (index < m_padding[NumDims-1].first || index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { - return Scalar(0); + return internal::scalar_cast_op()(0); } inputIndex += (index - m_padding[NumDims-1].first); } @@ -194,14 +194,14 @@ struct TensorEvaluator, Device { const Index idx = coords[0]; if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { - return Scalar(0); + return internal::scalar_cast_op()(0); } inputIndex = idx - m_padding[0].first; } for (int i = 1; i < NumDims; ++i) { const Index idx = coords[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return Scalar(0); + return internal::scalar_cast_op()(0); } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; } @@ 
-209,14 +209,14 @@ struct TensorEvaluator, Device { const Index idx = coords[NumDims-1]; if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { - return Scalar(0); + return internal::scalar_cast_op()(0); } inputIndex = idx - m_padding[NumDims-1].first; } for (int i = NumDims - 2; i >= 0; --i) { const Index idx = coords[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return Scalar(0); + return internal::scalar_cast_op()(0); } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; } @@ -245,11 +245,11 @@ struct TensorEvaluator, Device if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. - return internal::pset1(Scalar(0)); + return internal::pset1(internal::scalar_cast_op()(0)); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. - return internal::pset1(Scalar(0)); + return internal::pset1(internal::scalar_cast_op()(0)); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficient are between the 2 padding zones. @@ -271,11 +271,11 @@ struct TensorEvaluator, Device if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. - return internal::pset1(Scalar(0)); + return internal::pset1(internal::scalar_cast_op()(0)); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. - return internal::pset1(Scalar(0)); + return internal::pset1(internal::scalar_cast_op()(0)); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficient are between the 2 padding zones. @@ -304,11 +304,11 @@ struct TensorEvaluator, Device if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. - return internal::pset1(Scalar(0)); + return internal::pset1(internal::scalar_cast_op()(0)); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. - return internal::pset1(Scalar(0)); + return internal::pset1(internal::scalar_cast_op()(0)); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficient are between the 2 padding zones. @@ -330,11 +330,11 @@ struct TensorEvaluator, Device if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. - return internal::pset1(Scalar(0)); + return internal::pset1(internal::scalar_cast_op()(0)); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. - return internal::pset1(Scalar(0)); + return internal::pset1(internal::scalar_cast_op()(0)); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficient are between the 2 padding zones. -- cgit v1.2.3 From 055000a42466670d7fd0162f026cde9ab90f9b25 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 23 Feb 2016 11:07:59 +0100 Subject: Fix startRow()/startCol() for dense Block with direct access: the initial implementation failed for empty rows/columns, for which the offsets are ambiguous. --- Eigen/src/Core/Block.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 661e64f3d..2f46c878d 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -348,7 +348,9 @@ class BlockImpl_dense || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), BlockRows==1 ?
1 : xpr.rows(), BlockCols==1 ? 1 : xpr.cols()), - m_xpr(xpr) + m_xpr(xpr), + m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0), + m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0) { init(); } @@ -358,7 +360,7 @@ class BlockImpl_dense EIGEN_DEVICE_FUNC inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), - m_xpr(xpr) + m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); } @@ -370,7 +372,7 @@ class BlockImpl_dense Index startRow, Index startCol, Index blockRows, Index blockCols) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), - m_xpr(xpr) + m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); } @@ -403,15 +405,13 @@ class BlockImpl_dense EIGEN_DEVICE_FUNC StorageIndex startRow() const { - std::ptrdiff_t diff = Base::data() - m_xpr.data(); - return XprType::IsRowMajor ? (diff/m_xpr.outerStride()) : (diff%m_xpr.outerStride()); + return m_startRow.value(); } EIGEN_DEVICE_FUNC StorageIndex startCol() const { - std::ptrdiff_t diff = Base::data() - m_xpr.data(); - return XprType::IsRowMajor ? (diff%m_xpr.outerStride()) : (diff/m_xpr.outerStride()); + return m_startCol.value(); } #ifndef __SUNPRO_CC @@ -440,6 +440,8 @@ class BlockImpl_dense } XprTypeNested m_xpr; + const internal::variable_if_dynamic m_startRow; + const internal::variable_if_dynamic m_startCol; Index m_outerStride; }; -- cgit v1.2.3 From 7a01cb8e4b647e8cf4223f30266c8bab1d3fb66c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 24 Feb 2016 16:43:01 -0800 Subject: Marked the And and Or reducers as stateless. --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index e2d876140..7796e1a88 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -240,6 +240,8 @@ template struct ProdReducer struct AndReducer { static const bool PacketAccess = false; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; } @@ -253,6 +255,8 @@ struct AndReducer struct OrReducer { static const bool PacketAccess = false; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; } -- cgit v1.2.3 From c36c09169e1545e287293f3f145fa5a25b47b84a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 24 Feb 2016 17:07:25 -0800 Subject: Fixed a typo in the reduction code that could prevent large full reductions from running properly on old cuda devices. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 22aea5ea4..f7c1a5cf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -515,7 +515,7 @@ struct TensorEvaluator, Device> // Use the FullReducer if possible.
if (RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - (internal::array_prod(m_impl.dimensions()) > 1024 * 1024))) { + (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) { bool need_assign = false; if (!data) { -- cgit v1.2.3 From af199b4658963cc9cb7b91ba09b3f6f8f3c8017c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 25 Feb 2016 09:06:18 -0800 Subject: Made the CUDA architecture level a build setting. --- CMakeLists.txt | 2 ++ unsupported/test/CMakeLists.txt | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eaee5d5e2..1c979747c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,6 +336,8 @@ endif() option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF) +set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") + include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) # Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 2c686177b..3be43f47f 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -162,7 +162,7 @@ if(CUDA_FOUND) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() - set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30 -Xcudafe \"--display_error_number\"") + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") @@ -173,9 +173,10 @@ if(CUDA_FOUND) ei_add_test(cxx11_tensor_random_cuda) ei_add_test(cxx11_tensor_argmax_cuda) - set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_53 -Xcudafe \"--display_error_number\"") - ei_add_test(cxx11_tensor_of_float16_cuda) - + # Half floats are only supported starting with arch 5.3 + if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 52) + ei_add_test(cxx11_tensor_of_float16_cuda) + endif() unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() -- cgit v1.2.3 From d9d05dd96eca8800e16fc848b66a31d33f143478 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 26 Feb 2016 04:13:58 -0800 Subject: Fixed handling of long doubles on aarch64 --- unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index 14a8aef58..68456f579 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -258,7 +258,7 @@ struct matrix_exp_computeUV { #if LDBL_MANT_DIG == 53 // double precision - matrix_exp_computeUV::run(arg, U, V, squarings); + matrix_exp_computeUV::run(arg.cast(), U.cast(), squarings); #else -- cgit v1.2.3 From 2cd32cad2782c467316297cec1a2b0ddff89c686 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 26 Feb 2016 13:21:44 +0000 Subject: Reverted previous commit since it caused more problems than it solved --- unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index 68456f579..bbb7e5776 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -257,8 +257,7 @@ struct matrix_exp_computeUV static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings) { #if LDBL_MANT_DIG == 53 // double precision - - matrix_exp_computeUV::run(arg.cast(), U.cast(), squarings); + matrix_exp_computeUV::run(arg, U, V, squarings); #else -- cgit v1.2.3 From 002824e32def5c9a430acac4bd9fc05308c923bb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 26 Feb 2016 12:21:25 -0800 Subject: Added benchmarks for fp16 --- bench/tensors/README | 8 ++- bench/tensors/tensor_benchmarks_fp16_gpu.cu | 76 +++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 bench/tensors/tensor_benchmarks_fp16_gpu.cu diff --git a/bench/tensors/README b/bench/tensors/README index 6b51fe878..1de0a5786 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -1,8 +1,12 @@ Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. -To compile the CPU benchmarks, simply call: +To compile the floating point CPU benchmarks, simply call: g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu -To compile the GPU benchmarks, simply call: +To compile the floating point GPU benchmarks, simply call: nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu + +To compile the half float GPU benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. 
+nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_gpu + diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu new file mode 100644 index 000000000..d841bcdac --- /dev/null +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -0,0 +1,76 @@ +#define EIGEN_USE_GPU + +#include +#include +#include + +#include "tensor_benchmarks.h" + +// Simple functions +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC, 10, 5000); + +BM_FuncGPU(memcpy); +//BM_FuncGPU(typeCasting); +//BM_FuncGPU(random); +BM_FuncGPU(slicing); +BM_FuncGPU(rowChip); +BM_FuncGPU(colChip); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); +BM_FuncGPU(broadcasting); +//BM_FuncGPU(coeffWiseOp); +//BM_FuncGPU(algebraicFunc); +//BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +/*BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); +*/ + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +/* +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); +*/ \ No newline at end of file -- cgit v1.2.3 From 93485d86bcddc0665939ce2c43261dfaa1b8cc90 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 26 Feb 2016 12:24:58 -0800 Subject: Added benchmarks for type casting of float16 --- bench/tensors/tensor_benchmarks.h | 6 +++--- bench/tensors/tensor_benchmarks_fp16_gpu.cu | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index b208a401a..131d056b4 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -48,12 +48,12 @@ template class BenchmarkSuite { Eigen::array sizes; sizes[0] = m_; sizes[1] = k_; - const TensorMap, Eigen::Aligned> A(a_, sizes); - TensorMap, Eigen::Aligned> B((int*)b_, sizes); + const TensorMap, Eigen::Aligned> A((int*)a_, sizes); + TensorMap, Eigen::Aligned> B(b_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - B.device(device_) = A.template cast(); + B.device(device_) = A.template cast(); } // Record the number of values copied per second 
finalizeBenchmark(static_cast(m_) * k_ * num_iters); diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index d841bcdac..49f75472a 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -19,7 +19,7 @@ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); BM_FuncGPU(memcpy); -//BM_FuncGPU(typeCasting); +BM_FuncGPU(typeCasting); //BM_FuncGPU(random); BM_FuncGPU(slicing); BM_FuncGPU(rowChip); -- cgit v1.2.3 From caa54d888f8873f7e19a2b97f4b90039e54c66b2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 26 Feb 2016 12:38:18 -0800 Subject: Made the TensorIndexList usable on GPU without having to use the -relaxed-constexpr compilation flag --- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 92 +++++++++++----------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 74ce6d0ec..01c31c13e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -39,19 +39,19 @@ namespace Eigen { template struct type2index { static const DenseIndex value = n; - constexpr operator DenseIndex() const { return n; } - void set(DenseIndex val) { + EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; } + EIGEN_DEVICE_FUNC void set(DenseIndex val) { eigen_assert(val == n); } }; namespace internal { template -void update_value(T& val, DenseIndex new_val) { +EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { val = new_val; } template -void update_value(type2index& val, DenseIndex new_val) { +EIGEN_DEVICE_FUNC void update_value(type2index& val, DenseIndex new_val) { val.set(new_val); } @@ -85,8 +85,8 @@ struct IndexTuple; template struct IndexTuple { - constexpr IndexTuple() : head(), others() { } - constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { } + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { } + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) 
{ } constexpr static int count = 1 + sizeof...(O); T head; @@ -97,8 +97,8 @@ struct IndexTuple { template struct IndexTuple { - constexpr IndexTuple() : head() { } - constexpr IndexTuple(const T& v) : head(v) { } + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { } + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { } constexpr static int count = 1; T head; @@ -114,33 +114,33 @@ struct IndexTupleExtractor { typedef typename IndexTupleExtractor::ValType ValType; - static constexpr ValType& get_val(IndexTuple& val) { + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { return IndexTupleExtractor::get_val(val.others); } - static constexpr const ValType& get_val(const IndexTuple& val) { + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { return IndexTupleExtractor::get_val(val.others); } template - static void set_val(IndexTuple& val, V& new_val) { + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { IndexTupleExtractor::set_val(val.others, new_val); } }; - template - struct IndexTupleExtractor<0, T, O...> { +template + struct IndexTupleExtractor<0, T, O...> { - typedef T ValType; + typedef T ValType; - static constexpr ValType& get_val(IndexTuple& val) { + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { return val.head; } - static constexpr const ValType& get_val(const IndexTuple& val) { + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { return val.head; } template - static void set_val(IndexTuple& val, V& new_val) { + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { val.head = new_val; } }; @@ -148,11 +148,11 @@ struct IndexTupleExtractor { template -constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { +EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { return IndexTupleExtractor::get_val(tuple); } template -constexpr const typename IndexTupleExtractor::ValType& array_get(const IndexTuple& tuple) { +EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get(const IndexTuple& tuple) { return IndexTupleExtractor::get_val(tuple); } template @@ -170,11 +170,11 @@ template template struct tuple_coeff { template - static constexpr DenseIndex get(const DenseIndex i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple& t) { return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); } template - static void set(const DenseIndex i, IndexTuple& t, const DenseIndex value) { + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const DenseIndex value) { if (i == Idx) { update_value(array_get(t), value); } else { @@ -183,19 +183,19 @@ struct tuple_coeff { } template - static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { return ((i == Idx) & is_compile_time_constant::ValType>::value) || tuple_coeff::value_known_statically(i, t); } template - static constexpr bool values_up_to_known_statically(const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { return is_compile_time_constant::ValType>::value && tuple_coeff::values_up_to_known_statically(t); } template - static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { + 
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { return is_compile_time_constant::ValType>::value && is_compile_time_constant::ValType>::value && array_get(t) > array_get(t) && @@ -206,27 +206,27 @@ struct tuple_coeff { template <> struct tuple_coeff<0> { template - static constexpr DenseIndex get(const DenseIndex i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return array_get<0>(t) * (i == 0); } template - static void set(const DenseIndex i, IndexTuple& t, const DenseIndex value) { + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const DenseIndex value) { eigen_assert (i == 0); update_value(array_get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { return is_compile_time_constant::ValType>::value & (i == 0); } template - static constexpr bool values_up_to_known_statically(const IndexTuple&) { + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { return is_compile_time_constant::ValType>::value; } template - static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { return true; } }; @@ -235,7 +235,7 @@ struct tuple_coeff<0> { template - struct IndexList : internal::IndexTuple { +struct IndexList : internal::IndexTuple { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { return internal::tuple_coeff >::value-1>::get(i, *this); } @@ -246,18 +246,18 @@ template return internal::tuple_coeff >::value-1>::set(i, *this, value); } - constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } - constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } - constexpr IndexList() : internal::IndexTuple() { } + EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } + EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) 
{ } + EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } - constexpr bool value_known_statically(const DenseIndex i) const { + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { return internal::tuple_coeff >::value-1>::value_known_statically(i, *this); } - constexpr bool all_values_known_statically() const { + EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { return internal::tuple_coeff >::value-1>::values_up_to_known_statically(*this); } - constexpr bool values_statically_known_to_increase() const { + EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { return internal::tuple_coeff >::value-1>::values_up_to_statically_known_to_increase(*this); } }; @@ -286,30 +286,30 @@ template struct array_size >::value; }; -template constexpr DenseIndex array_get(IndexList& a) { +template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList& a) { return IndexTupleExtractor::get_val(a); } -template constexpr DenseIndex array_get(const IndexList& a) { +template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList& a) { return IndexTupleExtractor::get_val(a); } template struct index_known_statically_impl { - static constexpr bool run(const DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return false; } }; template struct index_known_statically_impl > { - static constexpr bool run(const DenseIndex i) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { return IndexList().value_known_statically(i); } }; template struct index_known_statically_impl > { - static constexpr bool run(const DenseIndex i) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { return IndexList().value_known_statically(i); } }; @@ -324,14 +324,14 @@ struct all_indices_known_statically_impl { template struct all_indices_known_statically_impl > { - static constexpr bool run() { + EIGEN_DEVICE_FUNC static constexpr bool run() { return IndexList().all_values_known_statically(); } }; template struct all_indices_known_statically_impl > { - static constexpr bool run() { + EIGEN_DEVICE_FUNC static constexpr bool run() { return IndexList().all_values_known_statically(); } }; @@ -339,21 +339,21 @@ struct all_indices_known_statically_impl struct indices_statically_known_to_increase_impl { - static constexpr bool run() { + EIGEN_DEVICE_FUNC static constexpr bool run() { return false; } }; template struct indices_statically_known_to_increase_impl > { - static constexpr bool run() { + EIGEN_DEVICE_FUNC static constexpr bool run() { return Eigen::IndexList().values_statically_known_to_increase(); } }; template struct indices_statically_known_to_increase_impl > { - static constexpr bool run() { + EIGEN_DEVICE_FUNC static constexpr bool run() { return Eigen::IndexList().values_statically_known_to_increase(); } }; -- cgit v1.2.3 From ac2e6e0d03c6027d9a1bbef356c2e149d8a9205a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 26 Feb 2016 13:52:24 -0800 Subject: Properly vectorized the random number generators --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 101 ++++++++++++--------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 7796e1a88..528909688 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -342,17 +342,17 @@ template class UniformRandomGenerator { } template 
- T operator()(Index, Index = 0) const { + T operator()(Index) const { return random(); } - template - typename internal::packet_traits::type packetOp(Index, Index = 0) const { - const int packetSize = internal::packet_traits::size; + template + PacketType packetOp(Index) const { + const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; for (int i = 0; i < packetSize; ++i) { values[i] = random(); } - return internal::pload::type>(values); + return internal::pload(values); } private: @@ -370,22 +370,22 @@ template <> class UniformRandomGenerator { } } UniformRandomGenerator(const UniformRandomGenerator& other) { - m_generator.seed(other(0, 0) * UINT_MAX); + m_generator.seed(other(0) * UINT_MAX); m_deterministic = other.m_deterministic; } template - float operator()(Index, Index = 0) const { + float operator()(Index) const { return m_distribution(m_generator); } - template - typename internal::packet_traits::type packetOp(Index i, Index j = 0) const { - const int packetSize = internal::packet_traits::size; + template + PacketType packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX float values[packetSize]; for (int k = 0; k < packetSize; ++k) { - values[k] = this->operator()(i, j); + values[k] = this->operator()(i); } - return internal::pload::type>(values); + return internal::pload(values); } private: @@ -407,22 +407,22 @@ template <> class UniformRandomGenerator { } } UniformRandomGenerator(const UniformRandomGenerator& other) { - m_generator.seed(other(0, 0) * UINT_MAX); + m_generator.seed(other(0) * UINT_MAX); m_deterministic = other.m_deterministic; } template - double operator()(Index, Index = 0) const { + double operator()(Index) const { return m_distribution(m_generator); } - template - typename internal::packet_traits::type packetOp(Index i, Index j = 0) const { - const int packetSize = internal::packet_traits::size; + template + PacketType packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX double values[packetSize]; for (int k = 0; k < packetSize; ++k) { - values[k] = this->operator()(i, j); + values[k] = this->operator()(i); } - return internal::pload::type>(values); + return internal::pload(values); } private: @@ -458,11 +458,12 @@ template <> class UniformRandomGenerator { } template - __device__ float operator()(Index, Index = 0) const { + __device__ float operator()(Index) const { return curand_uniform(&m_state); } - template - __device__ float4 packetOp(Index, Index = 0) const { + template + __device__ float4 packetOp(Index) const { + EIGEN_STATIC_ASSERT((is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); return curand_uniform4(&m_state); } @@ -487,11 +488,12 @@ template <> class UniformRandomGenerator { curand_init(seed, tid, 0, &m_state); } template - __device__ double operator()(Index, Index = 0) const { + __device__ double operator()(Index) const { return curand_uniform_double(&m_state); } - template - __device__ double2 packetOp(Index, Index = 0) const { + template + __device__ double2 packetOp(Index) const { + EIGEN_STATIC_ASSERT((is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); return curand_uniform2_double(&m_state); } @@ -516,7 +518,7 @@ template <> class UniformRandomGenerator > { curand_init(seed, tid, 0, &m_state); } template - __device__ std::complex operator()(Index, Index = 0) const { + __device__ std::complex operator()(Index) const { float4 vals = curand_uniform4(&m_state); return std::complex(vals.x, vals.y); } @@ -542,7 
+544,7 @@ template <> class UniformRandomGenerator > { curand_init(seed, tid, 0, &m_state); } template - __device__ std::complex operator()(Index, Index = 0) const { + __device__ std::complex operator()(Index) const { double2 vals = curand_uniform2_double(&m_state); return std::complex(vals.x, vals.y); } @@ -554,6 +556,14 @@ template <> class UniformRandomGenerator > { #endif +template +struct functor_traits > { + enum { + PacketAccess = UniformRandomGenerator::PacketAccess + }; +}; + + #if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711 // We're not compiling a cuda kernel @@ -568,21 +578,21 @@ template class NormalRandomGenerator { } NormalRandomGenerator(const NormalRandomGenerator& other) : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution) { - m_generator.seed(other(0, 0) * UINT_MAX); + m_generator.seed(other(0) * UINT_MAX); } template - T operator()(Index, Index = 0) const { + T operator()(Index) const { return m_distribution(m_generator); } - template - typename internal::packet_traits::type packetOp(Index, Index = 0) const { - const int packetSize = internal::packet_traits::size; + template + PacketType packetOp(Index) const { + const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; for (int i = 0; i < packetSize; ++i) { values[i] = m_distribution(m_generator); } - return internal::pload::type>(values); + return internal::pload(values); } private: @@ -612,11 +622,12 @@ template <> class NormalRandomGenerator { curand_init(seed, tid, 0, &m_state); } template - __device__ float operator()(Index, Index = 0) const { + __device__ float operator()(Index) const { return curand_normal(&m_state); } - template - __device__ float4 packetOp(Index, Index = 0) const { + template + __device__ float4 packetOp(Index) const { + EIGEN_STATIC_ASSERT((is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); return curand_normal4(&m_state); } @@ -641,11 +652,12 @@ template <> class NormalRandomGenerator { curand_init(seed, tid, 0, &m_state); } template - __device__ double operator()(Index, Index = 0) const { + __device__ double operator()(Index) const { return curand_normal_double(&m_state); } - template - __device__ double2 packetOp(Index, Index = 0) const { + template + __device__ double2 packetOp(Index) const { + EIGEN_STATIC_ASSERT((is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); return curand_normal2_double(&m_state); } @@ -670,7 +682,7 @@ template <> class NormalRandomGenerator > { curand_init(seed, tid, 0, &m_state); } template - __device__ std::complex operator()(Index, Index = 0) const { + __device__ std::complex operator()(Index) const { float4 vals = curand_normal4(&m_state); return std::complex(vals.x, vals.y); } @@ -696,7 +708,7 @@ template <> class NormalRandomGenerator > { curand_init(seed, tid, 0, &m_state); } template - __device__ std::complex operator()(Index, Index = 0) const { + __device__ std::complex operator()(Index) const { double2 vals = curand_normal2_double(&m_state); return std::complex(vals.x, vals.y); } @@ -718,6 +730,13 @@ template class NormalRandomGenerator { #endif +template +struct functor_traits > { + enum { + PacketAccess = NormalRandomGenerator::PacketAccess + }; +}; + template class GaussianGenerator { -- cgit v1.2.3 From 8e6faab51eff1e6f0d53c4152a14b3fbed09ed6c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 27 Feb 2016 14:55:40 +0100 Subject: bug #1172: make valuePtr and innerIndexPtr properly return null for empty matrices.
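The crux of this fix: the sparse containers previously implemented valuePtr() and innerIndexPtr() as &m_data.value(0), which indexes into the underlying arrays even when they hold no elements. The patch routes the accessors through raw-pointer getters on CompressedStorage, so an empty matrix yields a null pointer instead. A minimal sketch of the behavior this enables; this driver code is hypothetical and not part of the patch:

#include <Eigen/Sparse>
#include <cassert>

int main() {
  Eigen::SparseMatrix<double> m(10, 10);  // no nonzeros allocated yet
  // Before the fix these took the address of element 0 of an empty array;
  // after it they are expected to return null for an empty matrix.
  assert(m.valuePtr() == 0);
  assert(m.innerIndexPtr() == 0);
  m.insert(3, 4) = 1.0;                   // first insertion allocates storage
  assert(m.valuePtr() != 0);
  return 0;
}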
--- Eigen/src/SparseCore/CompressedStorage.h | 13 +++++++++---- Eigen/src/SparseCore/SparseBlock.h | 20 ++++++++++---------- Eigen/src/SparseCore/SparseCompressedBase.h | 4 ++-- Eigen/src/SparseCore/SparseMatrix.h | 12 ++++++------ Eigen/src/SparseCore/SparseRedux.h | 4 ++-- Eigen/src/SparseCore/SparseVector.h | 9 +++++---- 6 files changed, 34 insertions(+), 28 deletions(-) diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index 2199848e9..d89fa0dae 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -110,11 +110,16 @@ class CompressedStorage inline Index allocatedSize() const { return m_allocatedSize; } inline void clear() { m_size = 0; } - inline Scalar& value(Index i) { return m_values[i]; } - inline const Scalar& value(Index i) const { return m_values[i]; } + const Scalar* valuePtr() const { return m_values; } + Scalar* valuePtr() { return m_values; } + const StorageIndex* indexPtr() const { return m_indices; } + StorageIndex* indexPtr() { return m_indices; } - inline StorageIndex& index(Index i) { return m_indices[i]; } - inline const StorageIndex& index(Index i) const { return m_indices[i]; } + inline Scalar& value(Index i) { eigen_internal_assert(m_values!=0); return m_values[i]; } + inline const Scalar& value(Index i) const { eigen_internal_assert(m_values!=0); return m_values[i]; } + + inline StorageIndex& index(Index i) { eigen_internal_assert(m_indices!=0); return m_indices[i]; } + inline const StorageIndex& index(Index i) const { eigen_internal_assert(m_indices!=0); return m_indices[i]; } /** \returns the largest \c k such that for all \c j in [0,k) index[\c j]\<\a key */ inline Index searchLowerIndex(Index key) const diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 00409fb37..82fae8c4b 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -145,14 +145,14 @@ public: // realloc manually to reduce copies typename SparseMatrixType::Storage newdata(m_matrix.data().allocatedSize() - block_size + nnz); - internal::smart_copy(&m_matrix.data().value(0), &m_matrix.data().value(0) + start, &newdata.value(0)); - internal::smart_copy(&m_matrix.data().index(0), &m_matrix.data().index(0) + start, &newdata.index(0)); + internal::smart_copy(m_matrix.valuePtr(), m_matrix.valuePtr() + start, newdata.valuePtr()); + internal::smart_copy(m_matrix.innerIndexPtr(), m_matrix.innerIndexPtr() + start, newdata.indexPtr()); - internal::smart_copy(tmp.valuePtr(), tmp.valuePtr() + nnz, &newdata.value(start)); - internal::smart_copy(tmp.innerIndexPtr(), tmp.innerIndexPtr() + nnz, &newdata.index(start)); + internal::smart_copy(tmp.valuePtr(), tmp.valuePtr() + nnz, newdata.valuePtr() + start); + internal::smart_copy(tmp.innerIndexPtr(), tmp.innerIndexPtr() + nnz, newdata.indexPtr() + start); - internal::smart_copy(&matrix.data().value(end), &matrix.data().value(end) + tail_size, &newdata.value(start+nnz)); - internal::smart_copy(&matrix.data().index(end), &matrix.data().index(end) + tail_size, &newdata.index(start+nnz)); + internal::smart_copy(matrix.valuePtr()+end, matrix.valuePtr()+end + tail_size, newdata.valuePtr()+start+nnz); + internal::smart_copy(matrix.innerIndexPtr()+end, matrix.innerIndexPtr()+end + tail_size, newdata.indexPtr()+start+nnz); newdata.resize(m_matrix.outerIndexPtr()[m_matrix.outerSize()] - block_size + nnz); @@ -167,14 +167,14 @@ public: // no need to realloc, simply copy the tail at its respective position and 
insert tmp matrix.data().resize(start + nnz + tail_size); - internal::smart_memmove(&matrix.data().value(end), &matrix.data().value(end) + tail_size, &matrix.data().value(start + nnz)); - internal::smart_memmove(&matrix.data().index(end), &matrix.data().index(end) + tail_size, &matrix.data().index(start + nnz)); + internal::smart_memmove(matrix.valuePtr()+end, matrix.valuePtr() + end+tail_size, matrix.valuePtr() + start+nnz); + internal::smart_memmove(matrix.innerIndexPtr()+end, matrix.innerIndexPtr() + end+tail_size, matrix.innerIndexPtr() + start+nnz); update_trailing_pointers = true; } - internal::smart_copy(tmp.valuePtr(), tmp.valuePtr() + nnz, &matrix.data().value(start)); - internal::smart_copy(tmp.innerIndexPtr(), tmp.innerIndexPtr() + nnz, &matrix.data().index(start)); + internal::smart_copy(tmp.valuePtr(), tmp.valuePtr() + nnz, matrix.valuePtr() + start); + internal::smart_copy(tmp.innerIndexPtr(), tmp.innerIndexPtr() + nnz, matrix.innerIndexPtr() + start); } // update outer index pointers and innerNonZeros diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index ea71b41d1..6c15b1610 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -160,7 +160,7 @@ class SparseCompressedBase::InnerIterator } explicit InnerIterator(const internal::CompressedStorage& data) - : m_values(&data.value(0)), m_indices(&data.index(0)), m_outer(0), m_id(0), m_end(data.size()) + : m_values(&data.valuePtr()), m_indices(&data.indexPtr()), m_outer(0), m_id(0), m_end(data.size()) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); } @@ -220,7 +220,7 @@ class SparseCompressedBase::ReverseInnerIterator } explicit ReverseInnerIterator(const internal::CompressedStorage& data) - : m_values(&data.value(0)), m_indices(&data.index(0)), m_outer(0), m_start(0), m_id(data.size()) + : m_values(&data.valuePtr()), m_indices(&data.indexPtr()), m_outer(0), m_start(0), m_id(data.size()) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); } diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 8b57445e6..760e151eb 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -140,20 +140,20 @@ class SparseMatrix /** \returns a const pointer to the array of values. * This function is aimed at interoperability with other libraries. * \sa innerIndexPtr(), outerIndexPtr() */ - inline const Scalar* valuePtr() const { return &m_data.value(0); } + inline const Scalar* valuePtr() const { return m_data.valuePtr(); } /** \returns a non-const pointer to the array of values. * This function is aimed at interoperability with other libraries. * \sa innerIndexPtr(), outerIndexPtr() */ - inline Scalar* valuePtr() { return &m_data.value(0); } + inline Scalar* valuePtr() { return m_data.valuePtr(); } /** \returns a const pointer to the array of inner indices. * This function is aimed at interoperability with other libraries. * \sa valuePtr(), outerIndexPtr() */ - inline const StorageIndex* innerIndexPtr() const { return &m_data.index(0); } + inline const StorageIndex* innerIndexPtr() const { return m_data.indexPtr(); } /** \returns a non-const pointer to the array of inner indices. * This function is aimed at interoperability with other libraries. 
* \sa valuePtr(), outerIndexPtr() */ - inline StorageIndex* innerIndexPtr() { return &m_data.index(0); } + inline StorageIndex* innerIndexPtr() { return m_data.indexPtr(); } /** \returns a const pointer to the array of the starting positions of the inner vectors. * This function is aimed at interoperability with other libraries. @@ -740,8 +740,8 @@ class SparseMatrix { eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES"); this->m_data.resize(rows()); - Eigen::Map(&this->m_data.index(0), rows()).setLinSpaced(0, StorageIndex(rows()-1)); - Eigen::Map(&this->m_data.value(0), rows()).setOnes(); + Eigen::Map(this->m_data.indexPtr(), rows()).setLinSpaced(0, StorageIndex(rows()-1)); + Eigen::Map(this->m_data.valuePtr(), rows()).setOnes(); Eigen::Map(this->m_outerIndex, rows()+1).setLinSpaced(0, StorageIndex(rows())); std::free(m_innerNonZeros); m_innerNonZeros = 0; diff --git a/Eigen/src/SparseCore/SparseRedux.h b/Eigen/src/SparseCore/SparseRedux.h index 50ebb2e53..2a9718cfb 100644 --- a/Eigen/src/SparseCore/SparseRedux.h +++ b/Eigen/src/SparseCore/SparseRedux.h @@ -30,7 +30,7 @@ typename internal::traits >::Scalar SparseMatrix<_Scalar,_Options,_Index>::sum() const { eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix"); - return Matrix::Map(&m_data.value(0), m_data.size()).sum(); + return Matrix::Map(m_data.valuePtr(), m_data.size()).sum(); } template @@ -38,7 +38,7 @@ typename internal::traits >::Scalar SparseVector<_Scalar,_Options,_Index>::sum() const { eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix"); - return Matrix::Map(&m_data.value(0), m_data.size()).sum(); + return Matrix::Map(m_data.valuePtr(), m_data.size()).sum(); } } // end namespace Eigen diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 40a754300..167a9886c 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -83,11 +83,11 @@ class SparseVector EIGEN_STRONG_INLINE Index innerSize() const { return m_size; } EIGEN_STRONG_INLINE Index outerSize() const { return 1; } - EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return &m_data.value(0); } - EIGEN_STRONG_INLINE Scalar* valuePtr() { return &m_data.value(0); } + EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return m_data.valuePtr(); } + EIGEN_STRONG_INLINE Scalar* valuePtr() { return m_data.valuePtr(); } - EIGEN_STRONG_INLINE const StorageIndex* innerIndexPtr() const { return &m_data.index(0); } - EIGEN_STRONG_INLINE StorageIndex* innerIndexPtr() { return &m_data.index(0); } + EIGEN_STRONG_INLINE const StorageIndex* innerIndexPtr() const { return m_data.indexPtr(); } + EIGEN_STRONG_INLINE StorageIndex* innerIndexPtr() { return m_data.indexPtr(); } inline const StorageIndex* outerIndexPtr() const { return 0; } inline StorageIndex* outerIndexPtr() { return 0; } @@ -125,6 +125,7 @@ class SparseVector inline Scalar& coeffRef(Index i) { eigen_assert(i>=0 && i Date: Sat, 27 Feb 2016 20:22:04 +0000 Subject: Improved the README --- bench/tensors/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/tensors/README b/bench/tensors/README index 1de0a5786..4398aa81b 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -8,5 +8,5 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU To compile the half float GPU benchmarks, simply call the command line below. 
You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. -nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_gpu +nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_fp16_gpu -- cgit v1.2.3 From 609b3337a701e322d0e33089b826d1ac7c2a11fe Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 27 Feb 2016 20:42:57 +0000 Subject: Print some information to stderr when a CUDA kernel fails --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index c01704e56..821835cf3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -211,8 +211,12 @@ struct GpuDevice { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { #if defined(__CUDACC__) && !defined(__CUDA_ARCH__) cudaError_t err = cudaStreamSynchronize(stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + if (err != cudaSuccess) { + std::cerr << "Error detected in CUDA stream: " + << cudaGetErrorString(err) + << std::endl; + assert(err == cudaSuccess); + } #else assert(false && "The default device should be used instead to generate kernel code"); #endif -- cgit v1.2.3 From e9bea614ecb6d910948e36b11483bbb0c0f83f76 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 29 Feb 2016 10:31:27 +0100 Subject: Fix shortcoming in fixed-value deduction of startRow/startCol --- Eigen/src/Core/Block.h | 8 ++++---- Eigen/src/Core/CoreEvaluators.h | 4 ++-- test/block.cpp | 5 +++++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 2f46c878d..11de45c2e 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -318,8 +318,8 @@ template m_startRow; - const internal::variable_if_dynamic m_startCol; + const internal::variable_if_dynamic m_startRow; + const internal::variable_if_dynamic m_startCol; const internal::variable_if_dynamic m_blockRows; const internal::variable_if_dynamic m_blockCols; }; @@ -440,8 +440,8 @@ class BlockImpl_dense } XprTypeNested m_xpr; - const internal::variable_if_dynamic m_startRow; - const internal::variable_if_dynamic m_startCol; + const internal::variable_if_dynamic m_startRow; + const internal::variable_if_dynamic m_startCol; Index m_outerStride; }; diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index a729e0454..388805f0d 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -865,8 +865,8 @@ struct unary_evaluator, IndexBa protected: evaluator m_argImpl; - const variable_if_dynamic m_startRow; - const variable_if_dynamic m_startCol; + const variable_if_dynamic m_startRow; + const variable_if_dynamic m_startCol; }; // TODO: This evaluator does not actually use the child evaluator; diff --git a/test/block.cpp b/test/block.cpp index 3b77b704a..1eeb2da27 100644 --- a/test/block.cpp +++ b/test/block.cpp @@ -181,6 +181,11 @@ template void block(const MatrixType& m) dm = m1.row(r1).segment(c1,c2-c1+1).transpose(); dv = m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0); VERIFY_IS_EQUAL(dv, dm); + + VERIFY_IS_EQUAL( (m1.template block(1,0,0,1)), m1.block(1,0,0,1)); + VERIFY_IS_EQUAL( (m1.template 
block<1,Dynamic>(0,1,1,0)), m1.block(0,1,1,0)); + VERIFY_IS_EQUAL( ((m1*1).template block(1,0,0,1)), m1.block(1,0,0,1)); + VERIFY_IS_EQUAL( ((m1*1).template block<1,Dynamic>(0,1,1,0)), m1.block(0,1,1,0)); } -- cgit v1.2.3 From 328484204559b3ae89c6131e65bdc397a17e0275 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 29 Feb 2016 10:48:16 -0800 Subject: Optimized the performance of narrow reductions on CUDA devices --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 6 ++---- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index f7c1a5cf4..88d51e5f0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -544,8 +544,7 @@ struct TensorEvaluator, Device> const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return false; + return internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); } bool preserving_inner_dims = true; @@ -561,8 +560,7 @@ struct TensorEvaluator, Device> const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return false; + return internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); } } return true; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 2da18b147..c3b1b8b7a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -230,9 +230,14 @@ struct InnerReducer { assert(false && "Should only be called to reduce floats on a gpu device"); } - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; + // It's faster to use the usual code. 
+ if (num_coeffs_to_reduce <= 32) { + return true; + } + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; @@ -255,6 +260,8 @@ struct InnerReducer { LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; } }; @@ -301,9 +308,14 @@ struct OuterReducer { assert(false && "Should only be called to reduce floats on a gpu device"); } - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 32) { + return true; + } + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; @@ -326,6 +338,8 @@ struct OuterReducer { LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; } }; -- cgit v1.2.3 From b2075cb7a2d321a11f2c9b96877eaf2d49dc1b25 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 29 Feb 2016 10:53:38 -0800 Subject: Made the signature of the inner and outer reducers consistent --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 6 ++++-- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 88d51e5f0..d01a63ccb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -345,8 +345,9 @@ template struct InnerReducer { static const bool HasOptimizedImplementation = false; - EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { eigen_assert(false && "Not implemented"); + return true; } }; @@ -355,8 +356,9 @@ template struct OuterReducer { static const bool HasOptimizedImplementation = false; - EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { eigen_assert(false && "Not implemented"); + return true; } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index c3b1b8b7a..bad5c1425 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -226,8 +226,9 @@ struct InnerReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename 
Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); + return true; } static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { @@ -304,8 +305,9 @@ struct OuterReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); + return true; } static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { -- cgit v1.2.3 From 56a3ada6701b8e8645df4e00a2ef93d45a4f970a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 29 Feb 2016 14:57:52 -0800 Subject: Added benchmarks for full reduction --- bench/tensors/tensor_benchmarks.h | 26 ++++++++++++++++++++++++-- bench/tensors/tensor_benchmarks_gpu.cu | 1 + 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 131d056b4..d916f787e 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -297,7 +297,8 @@ template class BenchmarkSuite { input_size[0] = k_; input_size[1] = n_; const TensorMap, Eigen::Aligned> B(b_, input_size); - const Eigen::array output_size = {{n_}}; + Eigen::array output_size; + output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST @@ -325,7 +326,8 @@ template class BenchmarkSuite { input_size[1] = n_; const TensorMap, Eigen::Aligned> B( b_, input_size); - const Eigen::array output_size = {{k_}}; + Eigen::array output_size; + output_size[0] = k_; TensorMap, Eigen::Aligned> C( c_, output_size); @@ -347,6 +349,26 @@ template class BenchmarkSuite { finalizeBenchmark(static_cast(k_) * n_ * num_iters); } + // Full reduction + void fullReduction(int num_iters) { + Eigen::array input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap, Eigen::Aligned> B( + b_, input_size); + const Eigen::array output_size; + TensorMap, Eigen::Aligned> C( + c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(static_cast(k_) * n_ * num_iters); + } + // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { Eigen::array sizeA; diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index a6f594382..76d68c5c1 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -33,6 +33,7 @@ BM_FuncGPU(algebraicFunc); BM_FuncGPU(transcendentalFunc); BM_FuncGPU(rowReduction); BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); // Contractions -- cgit v1.2.3 From 68ac5c1738083796084fb554c5d167056bb92fc8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 29 Feb 2016 18:11:58 -0800 Subject: Improved the performance of large outer reductions on cuda --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
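The single-line change in this commit replaces the grid-stride loop bound num_preserved_coeffs * numext::maxi(1, (num_coeffs_to_reduce - NumPerThread + 1)) with num_preserved_coeffs * divup(num_coeffs_to_reduce, NumPerThread), so the loop walks one iteration per NumPerThread-sized chunk of the reduced dimension rather than nearly one per coefficient. For reference, a standalone equivalent of the divup helper; this is a sketch, Eigen ships its own definition:

// Ceiling division: number of chunks of size y needed to cover x items.
// With the kernel's num_per_thread = 16, divup(1000, 16) == 63,
// whereas the old bound was 1000 - 16 + 1 == 985 iterations per column.
template <typename Index>
inline Index divup(Index x, Index y) {
  return (x + y - 1) / y;
}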
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index bad5c1425..444766f96 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -281,7 +281,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu } // Do the reduction. - const Index max_iter = num_preserved_coeffs * numext::maxi(1, (num_coeffs_to_reduce - NumPerThread + 1)); + const Index max_iter = num_preserved_coeffs * divup(num_coeffs_to_reduce, NumPerThread); for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; -- cgit v1.2.3 From bee9efc20315d5c6095e8a5ebb1bd7854bf3facf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 1 Mar 2016 12:47:27 +0100 Subject: Compilation fix --- Eigen/src/SparseCore/SparseCompressedBase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index 6c15b1610..c236d1397 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -160,7 +160,7 @@ class SparseCompressedBase::InnerIterator } explicit InnerIterator(const internal::CompressedStorage& data) - : m_values(&data.valuePtr()), m_indices(&data.indexPtr()), m_outer(0), m_id(0), m_end(data.size()) + : m_values(data.valuePtr()), m_indices(data.indexPtr()), m_outer(0), m_id(0), m_end(data.size()) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); } -- cgit v1.2.3 From dfa80b206067d96a88bc06bf45ad6216e3432018 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 1 Mar 2016 12:48:56 +0100 Subject: Compilation fix --- Eigen/src/SparseCore/SparseCompressedBase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index c236d1397..15854a73b 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -220,7 +220,7 @@ class SparseCompressedBase::ReverseInnerIterator } explicit ReverseInnerIterator(const internal::CompressedStorage& data) - : m_values(&data.valuePtr()), m_indices(&data.indexPtr()), m_outer(0), m_start(0), m_id(data.size()) + : m_values(data.valuePtr()), m_indices(data.indexPtr()), m_outer(0), m_start(0), m_id(data.size()) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); } -- cgit v1.2.3 From 3fccef6f50f365a10d68c0b71e305f36329b5e03 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 2 Mar 2016 13:22:46 +0100 Subject: bug #537: fix compilation with Apples's compiler --- Eigen/src/Geometry/Quaternion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index 32e7e76fa..32d1499c6 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -277,7 +277,7 @@ public: inline Coefficients& coeffs() { return m_coeffs;} inline const Coefficients& coeffs() const { return m_coeffs;} - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsAlignment) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(NeedsAlignment)) #ifdef EIGEN_QUATERNION_PLUGIN # include EIGEN_QUATERNION_PLUGIN -- cgit v1.2.3 From 6afea4683847f1408d6de72b886af59576c66c8d Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 2 Mar 2016 21:35:48 -0800 Subject: Add infinity() support to 
numext::numeric_limits, use it in lgamma. This makes the infinity access a __device__ function, removing nvcc warnings. --- Eigen/src/Core/NumTraits.h | 5 +++++ Eigen/src/Core/SpecialFunctions.h | 2 +- Eigen/src/Core/util/Meta.h | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 2ea5eb272..6a596bb7d 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -90,6 +90,11 @@ template struct GenericNumTraits static inline T lowest() { return IsInteger ? (numext::numeric_limits::min)() : (-(numext::numeric_limits::max)()); } + + EIGEN_DEVICE_FUNC + static inline T infinity() { + return numext::numeric_limits::infinity(); + } }; template struct NumTraits : GenericNumTraits diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 6b4598e3e..b02ad9a1f 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -283,7 +283,7 @@ struct digamma_impl { Scalar p, q, nz, s, w, y; bool negative; - const Scalar maxnum = std::numeric_limits::infinity(); + const Scalar maxnum = numext::numeric_limits::infinity(); const Scalar m_pi = 3.14159265358979323846; negative = 0; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index b01437d88..1cab8278c 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -147,6 +147,7 @@ template struct numeric_limits static T epsilon() { return 0; } static T (max)() { assert(false && "Highest not supported for this type"); } static T (min)() { assert(false && "Lowest not supported for this type"); } + static T infinity() { assert(false && "Infinity not supported for this type"); } }; template<> struct numeric_limits { @@ -156,6 +157,8 @@ template<> struct numeric_limits static float (max)() { return CUDART_MAX_NORMAL_F; } EIGEN_DEVICE_FUNC static float (min)() { return FLT_MIN; } + EIGEN_DEVICE_FUNC + static float infinity() { return CUDART_INF_F; } }; template<> struct numeric_limits { @@ -165,6 +168,8 @@ template<> struct numeric_limits static double (max)() { return DBL_MAX; } EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } + EIGEN_DEVICE_FUNC + static float infinity() { return CUDART_INF; } }; template<> struct numeric_limits { -- cgit v1.2.3 From ab3dc0b0fe64c34fab110f15914b0b9fcc0329da Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 2 Mar 2016 21:48:46 -0800 Subject: Small bugfix to numeric_limits for CUDA. --- Eigen/src/Core/util/Meta.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 1cab8278c..6b35179f2 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -169,7 +169,7 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF; } + static double infinity() { return CUDART_INF; } }; template<> struct numeric_limits { -- cgit v1.2.3 From 1da10a73580b3f3b672397ad65cded9300535ac7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 3 Mar 2016 10:33:20 -0800 Subject: Enable the conversion between floats and half floats on older GPUs that support it. 
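The hunks below only touch preprocessor guards: the float/half conversion intrinsics (__float2half, __half2float, __half22float2, ...) are usable from compute capability 3.0, while half-precision arithmetic still requires 5.3, so the casts in TypeCasting.h can be enabled on Kepler. The resulting guard pattern in isolation; a hypothetical illustration, not code from the patch:

#include <cuda_fp16.h>

// Conversions are guarded at sm_30; arithmetic such as __hadd stays at sm_53.
__device__ float roundtrip_through_half(float x) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  half h = __float2half(x);   // conversion intrinsics exist on Kepler (sm_30)
  return __half2float(h);
#else
  return x;                   // no half support below sm_30 (the patch asserts here)
#endif
}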
--- Eigen/src/Core/arch/CUDA/TypeCasting.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index 2742a4e7b..b59b42170 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -21,7 +21,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const { - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(a); #else assert(false && "tbd"); @@ -40,7 +40,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const { - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(static_cast(a)); #else assert(false && "tbd"); @@ -59,7 +59,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const { - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __half2float(a); #else assert(false && "tbd"); @@ -85,7 +85,7 @@ struct type_casting_traits { }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 float2 r1 = __half22float2(a); float2 r2 = __half22float2(b); return make_float4(r1.x, r1.y, r2.x, r2.y); @@ -106,7 +106,7 @@ struct type_casting_traits { template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { // Simply discard the second half of the input -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float22half2_rn(make_float2(a.x, a.y)); #else assert(false && "tbd"); -- cgit v1.2.3 From 1032441c6fea0a0d98b394abe8ffdb228256f47b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 3 Mar 2016 10:34:20 -0800 Subject: Enable partial support for half floats on Kepler GPUs. --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 142 ++++++++++++++++-------------- 1 file changed, 78 insertions(+), 64 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 1a1b4ec3d..720155ce1 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -17,8 +17,10 @@ // we'll use on the host side (SSE, AVX, ...) 
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +// The following operations require arch >= 5.3 +#if __CUDA_ARCH__ >= 530 __device__ half operator + (const half& a, const half& b) { return __hadd(a, b); } @@ -60,6 +62,7 @@ __device__ half abs(const half& a) { return result; } } +#endif namespace Eigen { namespace internal { @@ -98,8 +101,79 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const half& return __half2half2(from); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const half* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE half2 ploaddup(const half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(half* to, const half2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { +#if __CUDA_ARCH__ >= 320 + return __ldg((const half2*)from); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { +#if __CUDA_ARCH__ >= 320 + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif +} + +template<> EIGEN_DEVICE_FUNC inline half2 pgather(const half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> EIGEN_DEVICE_FUNC inline half pfirst(const half2& a) { + return __low2half(a); +} + +template<> EIGEN_DEVICE_FUNC inline half2 pabs(const half2& a) { + half2 result; + result.x = a.x & 0x7FFF7FFF; + return result; +} + + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + half a1 = __low2half(kernel.packet[0]); + half a2 = __high2half(kernel.packet[0]); + half b1 = __low2half(kernel.packet[1]); + half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +// The following operations require arch >= 5.3 +#if __CUDA_ARCH__ >= 530 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const half& a) { - return __halves2half2(a, __hadd(a, __float2half(1))); + return __halves2half2(a, __hadd(a, __float2half(1.0f))); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { @@ -140,7 +214,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& float b1 = __low2float(b); float b2 = __high2float(b); half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + half r2 = a2 < b2 ? 
__high2half(a) : __high2half(b); return __halves2half2(r1, r2); } @@ -154,50 +228,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& return __halves2half2(r1, r2); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const half* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const half* from) { - return __halves2half2(from[0], from[1]); -} - -template<> EIGEN_STRONG_INLINE half2 ploaddup(const half* from) { - return __halves2half2(from[0], from[0]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(half* to, const half2& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, const half2& from) { - to[0] = __low2half(from); - to[1] = __high2half(from); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { - return __ldg((const half2*)from); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { - return __halves2half2(__ldg(from+0), __ldg(from+1)); -} - -template<> EIGEN_DEVICE_FUNC inline half2 pgather(const half* from, Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); -} - -template<> EIGEN_DEVICE_FUNC inline half pfirst(const half2& a) { - return __low2half(a); -} - template<> EIGEN_DEVICE_FUNC inline half predux(const half2& a) { return __hadd(__low2half(a), __high2half(a)); } @@ -217,23 +247,7 @@ template<> EIGEN_DEVICE_FUNC inline half predux_min(const half2& a) { template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) { return __hmul(__low2half(a), __high2half(a)); } - -template<> EIGEN_DEVICE_FUNC inline half2 pabs(const half2& a) { - half2 result; - result.x = a.x & 0x7FFF7FFF; - return result; -} - - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - half a1 = __low2half(kernel.packet[0]); - half a2 = __high2half(kernel.packet[0]); - half b1 = __low2half(kernel.packet[1]); - half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); -} +#endif } // end namespace internal -- cgit v1.2.3 From dac58d7c3599a1c5c7631a734ca95a60ddb549ef Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 3 Mar 2016 10:37:25 -0800 Subject: Added a test to validate the conversion of half floats into floats on Kepler GPUs. Restricted the testing of the random number generation code to GPU architecture greater than or equal to 3.5. --- unsupported/test/CMakeLists.txt | 9 ++- unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 73 ++++++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_cast_float16_cuda.cu diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 3be43f47f..bc9248b9e 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -170,10 +170,15 @@ if(CUDA_FOUND) ei_add_test(cxx11_tensor_cuda) ei_add_test(cxx11_tensor_contract_cuda) ei_add_test(cxx11_tensor_reduction_cuda) - ei_add_test(cxx11_tensor_random_cuda) ei_add_test(cxx11_tensor_argmax_cuda) + ei_add_test(cxx11_tensor_cast_float16_cuda) - # Half floats are only supported starting with arch 5.3 + # The random number generation code requires arch 3.5 or greater. 
+ if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) + ei_add_test(cxx11_tensor_random_cuda) + endif() + + # Operations other that casting of half floats are only supported starting with arch 5.3 if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 52) ei_add_test(cxx11_tensor_of_float16_cuda) endif() diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu new file mode 100644 index 000000000..7936a9126 --- /dev/null +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -0,0 +1,73 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include + +using Eigen::Tensor; + +#ifdef EIGEN_HAS_CUDA_FP16 + +void test_cuda_conversion() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + Tensor floats(num_elem); + floats.setRandom(); + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + half* d_half = (half*)gpu_device.allocate(num_elem * sizeof(half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float)); + + gpu_half.device(gpu_device) = gpu_float.cast(); + gpu_conv.device(gpu_device) = gpu_half.cast(); + + Tensor initial(num_elem); + Tensor final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + +#endif + + +void test_cxx11_tensor_cast_float16_cuda() +{ +#ifdef EIGEN_HAS_CUDA_FP16 + CALL_SUBTEST(test_cuda_conversion()); +#else + std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; +#endif +} -- cgit v1.2.3 From 5cf4558c0a139b4f0b6fc0c778fe596def93dd94 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 3 Mar 2016 12:36:55 -0800 Subject: Added support for rounding, flooring, and ceiling to the tensor api --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 4dea1d3a0..5679e58cf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -232,6 +232,25 @@ class TensorBase return TensorConversionOp(derived()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + round() const { + return unaryExpr(internal::scalar_round_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ceil() const { + return 
unaryExpr(internal::scalar_ceil_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + floor() const { + return unaryExpr(internal::scalar_floor_op()); + } + + // Generic binary operation support. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp -- cgit v1.2.3 From deea866bbd72ee7f8ae7ecd31a3d5e96e60269bf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 3 Mar 2016 12:38:02 -0800 Subject: Added tests to cover the new rounding, flooring and ceiling tensor operations. --- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_roundings.cpp | 62 +++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 unsupported/test/cxx11_tensor_roundings.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index bc9248b9e..cb056690a 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -137,6 +137,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_ref "-std=c++0x") ei_add_test(cxx11_tensor_random "-std=c++0x") ei_add_test(cxx11_tensor_casts "-std=c++0x") + ei_add_test(cxx11_tensor_roundings "-std=c++0x") ei_add_test(cxx11_tensor_reverse "-std=c++0x") ei_add_test(cxx11_tensor_layout_swap "-std=c++0x") ei_add_test(cxx11_tensor_io "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp new file mode 100644 index 000000000..2c26151ab --- /dev/null +++ b/unsupported/test/cxx11_tensor_roundings.cpp @@ -0,0 +1,62 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + + +static void test_float_rounding() +{ + Tensor ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor result = ftensor.round(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::round(ftensor(i,j))); + } + } +} + +static void test_float_flooring() +{ + Tensor ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor result = ftensor.floor(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::floor(ftensor(i,j))); + } + } +} + +static void test_float_ceiling() +{ + Tensor ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor result = ftensor.ceil(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::ceil(ftensor(i,j))); + } + } +} + +void test_cxx11_tensor_roundings() +{ + CALL_SUBTEST(test_float_rounding()); + CALL_SUBTEST(test_float_ceiling()); + CALL_SUBTEST(test_float_flooring()); +} -- cgit v1.2.3 From 7ea35bfa1c0b4950feae65d49c0e6f2cbf3691d9 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Thu, 3 Mar 2016 19:39:41 -0800 Subject: Initial implementation of igamma and igammac. 
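The hunks below wire the incomplete gamma functions through the packet layer (pigamma, pigammac), the coefficient-wise array API, and Cephes-derived scalar implementations. A minimal usage sketch of the array-level entry points added in GlobalFunctions.h, assuming a toolchain where EIGEN_HAS_C99_MATH is available (the scalar path is guarded on it); values are illustrative:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXd a(3), x(3);
  a << 0.5, 2.0, 10.0;  // shape parameters, must be positive
  x << 1.0, 2.0, 5.0;   // integration limits, must be positive
  // igamma is the lower regularized incomplete gamma integral and
  // igammac its complement, so this should print a column of ones.
  std::cout << Eigen::igamma(a, x) + Eigen::igammac(a, x) << std::endl;
  return 0;
}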
--- Eigen/src/Core/GenericPacketMath.h | 10 + Eigen/src/Core/GlobalFunctions.h | 30 +++ Eigen/src/Core/NumTraits.h | 5 + Eigen/src/Core/SpecialFunctions.h | 291 +++++++++++++++++++++++++++++- Eigen/src/Core/arch/CUDA/MathFunctions.h | 18 ++ Eigen/src/Core/arch/CUDA/PacketMath.h | 4 + Eigen/src/Core/functors/BinaryFunctors.h | 49 +++++ Eigen/src/Core/util/ForwardDeclarations.h | 2 + Eigen/src/Core/util/Meta.h | 5 + test/array.cpp | 47 ++++- 10 files changed, 459 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 02882bdea..ead0253df 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -78,6 +78,8 @@ struct default_packet_traits HasDiGamma = 0, HasErf = 0, HasErfc = 0, + HasIGamma = 0, + HasIGammac = 0, HasRound = 0, HasFloor = 0, @@ -457,6 +459,14 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } +/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); } + +/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); } + /*************************************************************************** * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 396da8e71..7df0fdda9 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -129,6 +129,36 @@ namespace Eigen ); } + /** \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise incomplete gamma function. + * + */ + template + inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> + igamma(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) + { + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); + } + + /** \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise complementary incomplete gamma function. 
+ * + */ + template + inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> + igammac(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) + { + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); + } + namespace internal { EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 6a596bb7d..7ddb4a867 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -95,6 +95,11 @@ template struct GenericNumTraits static inline T infinity() { return numext::numeric_limits::infinity(); } + + EIGEN_DEVICE_FUNC + static inline T quiet_NaN() { + return numext::numeric_limits::quiet_NaN(); + } }; template struct NumTraits : GenericNumTraits diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index b02ad9a1f..ff2146afc 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -283,7 +283,7 @@ struct digamma_impl { Scalar p, q, nz, s, w, y; bool negative; - const Scalar maxnum = numext::numeric_limits::infinity(); + const Scalar maxnum = NumTraits::infinity(); const Scalar m_pi = 3.14159265358979323846; negative = 0; @@ -401,6 +401,282 @@ struct erfc_impl { }; #endif // EIGEN_HAS_C99_MATH +/**************************************************************************** + * Implementation of igammac (complemented incomplete gamma integral) * + ****************************************************************************/ + +template +struct igammac_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template struct igamma_impl; // predeclare igamma_impl + +template +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igamc() + * + * Incomplete gamma integral (modified for Eigen) + * + * + * + * SYNOPSIS: + * + * double a, x, y, igamc(); + * + * y = igamc( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * + * igamc(a,x) = 1 - igam(a,x) + * + * inf. + * - + * 1 | | -t a-1 + * = ----- | e t dt. + * - | | + * | (a) - + * x + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 7.8e-6 5.9e-7 + * + * + * ACCURACY (double): + * + * Tested at random a, x. + * a x Relative error: + * arithmetic domain domain # trials peak rms + * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15 + * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. 
Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + const Scalar machep = NumTraits::epsilon(); + const Scalar maxlog = ::log(NumTraits::highest()); + const Scalar big = one / machep; + + Scalar ans, ax, c, yc, r, t, y, z; + Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; + + if ((x <= zero) || ( a <= zero)) { + return one; + } + + if ((x < one) || (x < a)) { + return (one - igamma_impl::run(a, x)); + } + + ax = a * ::log(x) - x - lgamma_impl::run(a); + if( ax < -maxlog ) { // underflow + return zero; + } + ax = ::exp(ax); + + // continued fraction + y = one - a; + z = x + y + one; + c = zero; + pkm2 = one; + qkm2 = x; + pkm1 = x + one; + qkm1 = z * x; + ans = pkm1/qkm1; + + do { + c += one; + y += one; + z += two; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if( qk != zero ) { + r = pk/qk; + t = ::abs( (ans - r)/r ); + ans = r; + } else { + t = one; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (::abs(pk) > big) { + pkm2 *= machep; + pkm1 *= machep; + qkm2 *= machep; + qkm1 *= machep; + } + } while( t > machep ); + + return ( ans * ax ); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of igamma (incomplete gamma integral) * + ****************************************************************************/ + +template +struct igamma_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igam() + * Incomplete gamma integral + * + * + * + * SYNOPSIS: + * + * double a, x, y, igam(); + * + * y = igam( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * x + * - + * 1 | | -t a-1 + * igam(a,x) = ----- | e t dt. + * - | | + * | (a) - + * 0 + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (double): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 200000 3.6e-14 2.9e-15 + * IEEE 0,100 300000 9.9e-14 1.5e-14 + * + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 20000 7.8e-6 5.9e-7 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + + /* left tail of incomplete gamma function: + * + * inf. 
k + * a -x - x + * x e > ---------- + * - - + * k=0 | (a+k+1) + * + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = NumTraits::epsilon(); + const Scalar maxlog = ::log(NumTraits::highest()); + + double ans, ax, c, r; + + if( (x <= zero) || ( a <= zero) ) { + return zero; + } + + if( (x > one) && (x > a ) ) { + return (one - igammac_impl::run(a,x)); + } + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * ::log(x) - x - lgamma_impl::run(a); + if( ax < -maxlog ) { + // underflow + return zero; + } + ax = ::exp(ax); + + /* power series */ + r = a; + c = one; + ans = one; + + do { + r += one; + c *= x/r; + ans += c; + } while( c/ans > machep ); + + return( ans * ax/a ); + } +}; + +#endif // EIGEN_HAS_C99_MATH + } // end namespace internal namespace numext { @@ -429,8 +705,21 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); } +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) + igamma(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) + igammac(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x); +} + } // end namespace numext + } // end namespace Eigen #endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index a2c06a817..6e84d3af8 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -116,6 +116,24 @@ double2 perfc(const double2& a) return make_double2(erfc(a.x), erfc(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigamma(const float4& a, const float4& x) +{ + using numext::pigamma; + return make_float4( + pigamma(a.x, x.x), + pigamma(a.y, x.y), + pigamma(a.z, x.z), + pigamma(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac(const double2& a, const double& x) +{ + using numext::pigammac; + return make_double2(pigammac(a.x, x.x), pigammac(a.y, x.y)); +} + #endif diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index d3d9f910e..d2563030b 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -43,6 +43,8 @@ template<> struct packet_traits : default_packet_traits HasDiGamma = 1, HasErf = 1, HasErfc = 1, + HasIgamma = 1, + HasIGammac = 1, HasBlend = 0, }; @@ -67,6 +69,8 @@ template<> struct packet_traits : default_packet_traits HasDiGamma = 1, HasErf = 1, HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, HasBlend = 0, }; diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 4962d625c..5cdfff845 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -337,6 +337,55 @@ template<> struct functor_traits { }; }; +/** \internal + * \brief Template functor to compute the incomplete gamma function igamma(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma + */ +template struct scalar_igamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igamma; return igamma(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + return internal::pigammac(a, x); + } +}; +template +struct 
functor_traits > { + enum { + // Guesstimate + Cost = 20 * NumTraits::MulCost + 10 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGamma + }; +}; + + +/** \internal + * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igammac + */ +template struct scalar_igammac_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igammac; return igammac(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const + { + return internal::pigammac(a, x); + } +}; +template +struct functor_traits > { + enum { + // Guesstimate + Cost = 20 * NumTraits::MulCost + 10 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGammac + }; +}; //---------- binary functors bound to a constant, thus appearing as a unary functor ---------- diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index f09632375..a102e5457 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -206,6 +206,8 @@ template struct scalar_add_op; template struct scalar_constant_op; template struct scalar_identity_op; template struct scalar_sign_op; +template struct scalar_igamma_op; +template struct scalar_igammac_op; template struct scalar_product_op; template struct scalar_multiple2_op; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 6b35179f2..24e8a6d8a 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -148,6 +148,7 @@ template struct numeric_limits static T (max)() { assert(false && "Highest not supported for this type"); } static T (min)() { assert(false && "Lowest not supported for this type"); } static T infinity() { assert(false && "Infinity not supported for this type"); } + static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); } }; template<> struct numeric_limits { @@ -159,6 +160,8 @@ template<> struct numeric_limits static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC static float infinity() { return CUDART_INF_F; } + EIGEN_DEVICE_FUNC + static float quiet_NaN() { return CUDART_NAN_F; } }; template<> struct numeric_limits { @@ -170,6 +173,8 @@ template<> struct numeric_limits static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC static double infinity() { return CUDART_INF; } + EIGEN_DEVICE_FUNC + static double quiet_NaN() { return CUDART_NAN; } }; template<> struct numeric_limits { diff --git a/test/array.cpp b/test/array.cpp index 96aef31c7..a37874cc2 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -295,7 +295,6 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(Eigen::pow(m1,2*exponents), m1.square().square()); VERIFY_IS_APPROX(m1.pow(2*exponents), m1.square().square()); VERIFY_IS_APPROX(pow(m1(0,0), exponents), ArrayType::Constant(rows,cols,m1(0,0)*m1(0,0))); - VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt()); VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt()); @@ -305,6 +304,14 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(log10(m3), log(m3)/log(10)); + // Smoke test to check any compilation issues + ArrayType m1_abs_p1 = m1.abs() + 1; + ArrayType m2_abs_p1 = m2.abs() + 1; + VERIFY_IS_APPROX(Eigen::igamma(m1_abs_p1, m2_abs_p1), Eigen::igamma(m1_abs_p1, m2_abs_p1)); + VERIFY_IS_APPROX(Eigen::igammac(m1_abs_p1, m2_abs_p1), 
Eigen::igammac(m1_abs_p1, m2_abs_p1)); + VERIFY_IS_APPROX(Eigen::igamma(m2_abs_p1, m1_abs_p1), Eigen::igamma(m2_abs_p1, m1_abs_p1)); + VERIFY_IS_APPROX(Eigen::igammac(m2_abs_p1, m1_abs_p1), Eigen::igammac(m2_abs_p1, m1_abs_p1)); + // scalar by array division const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); @@ -323,6 +330,44 @@ template void array_real(const ArrayType& m) std::numeric_limits::infinity()); VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)}; + + // location i*6+j corresponds to a_s[i], x_s[j]. + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igamma_s[][6] = { + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, + 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, + 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, + 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, + 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5013297751014064}}; + Scalar igammac_s[][6] = { + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49867022490946517}}; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + //std::cout << numext::igamma(a_s[i], x_s[j]) << " vs. " << igamma_s[i][j] << std::endl; + //std::cout << numext::igammac(a_s[i], x_s[j]) << " c.vs. 
" << + //igammac_s[i][j] << std::endl; + std::cout << a_s[i] << ", " << x_s[j] << std::endl; + VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); + VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + } + } } #endif // EIGEN_HAS_C99_MATH -- cgit v1.2.3 From 2c50fc878efc0033f2d39218c0b4e538ba93e271 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 14:09:38 -0800 Subject: Fixed a typo --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index ae0de9420..78d6da28a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -38,10 +38,10 @@ namespace { #elif EIGEN_COMP_MSVC DWORD leading_zeros = 0; if (sizeof(T) == 8) { - _BitScanReverse64(&leading_zero, val); + _BitScanReverse64(&leading_zeros, val); } else { - _BitScanReverse(&leading_zero, val); + _BitScanReverse(&leading_zeros, val); } return leading_zeros; #else -- cgit v1.2.3 From 174edf976b11a7089dd4d77264072a98afe9c607 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 14:11:13 -0800 Subject: Made the contraction test more portable --- unsupported/test/cxx11_tensor_contraction.cpp | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index b0d52c6cf..57ec5add7 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -29,7 +29,7 @@ static void test_evals() Tensor mat4(3,3); mat4.setZero(); - Eigen::array dims3({{DimPair(0, 0)}}); + Eigen::array dims3 = {{DimPair(0, 0)}}; typedef TensorEvaluator Evaluator; Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice()); eval.evalTo(mat4.data()); @@ -49,7 +49,7 @@ static void test_evals() Tensor mat5(2,2); mat5.setZero(); - Eigen::array dims4({{DimPair(1, 1)}}); + Eigen::array dims4 = {{DimPair(1, 1)}}; typedef TensorEvaluator Evaluator2; Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice()); eval2.evalTo(mat5.data()); @@ -64,7 +64,7 @@ static void test_evals() Tensor mat6(2,2); mat6.setZero(); - Eigen::array dims6({{DimPair(1, 0)}}); + Eigen::array dims6 = {{DimPair(1, 0)}}; typedef TensorEvaluator Evaluator3; Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice()); eval3.evalTo(mat6.data()); @@ -89,7 +89,7 @@ static void test_scalar() Tensor scalar(1); scalar.setZero(); - Eigen::array dims({{DimPair(0, 0)}}); + Eigen::array dims = {{DimPair(0, 0)}}; typedef TensorEvaluator Evaluator; Evaluator eval(vec1.contract(vec2, dims), DefaultDevice()); eval.evalTo(scalar.data()); @@ -113,7 +113,7 @@ static void test_multidims() Tensor mat3(2, 2, 2); mat3.setZero(); - Eigen::array dims({{DimPair(1, 2), DimPair(2, 3)}}); + Eigen::array dims = {{DimPair(1, 2), DimPair(2, 3)}}; typedef TensorEvaluator Evaluator; Evaluator eval(mat1.contract(mat2, dims), DefaultDevice()); eval.evalTo(mat3.data()); @@ -147,7 +147,7 @@ static void test_holes() { t1.setRandom(); t2.setRandom(); - Eigen::array dims({{DimPair(0, 0), DimPair(3, 4)}}); + Eigen::array dims = {{DimPair(0, 0), DimPair(3, 4)}}; Tensor result = t1.contract(t2, dims); VERIFY_IS_EQUAL(result.dimension(0), 5); VERIFY_IS_EQUAL(result.dimension(1), 7); @@ -182,7 +182,7 @@ static void test_full_redux() t1.setRandom(); t2.setRandom(); - Eigen::array 
dims({{DimPair(0, 0), DimPair(1, 1)}}); + Eigen::array dims = {{DimPair(0, 0), DimPair(1, 1)}}; Tensor result = t1.contract(t2, dims); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0) @@ -212,7 +212,7 @@ static void test_contraction_of_contraction() t3.setRandom(); t4.setRandom(); - Eigen::array dims({{DimPair(1, 0)}}); + Eigen::array dims = {{DimPair(1, 0)}}; auto contract1 = t1.contract(t2, dims); auto diff = t3 - contract1; auto contract2 = t1.contract(t4, dims); @@ -243,7 +243,7 @@ static void test_expr() Tensor mat3(2,2); - Eigen::array dims({{DimPair(1, 0)}}); + Eigen::array dims = {{DimPair(1, 0)}}; mat3 = mat1.contract(mat2, dims); VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0)); @@ -263,7 +263,7 @@ static void test_out_of_order_contraction() Tensor mat3(2, 2); - Eigen::array dims({{DimPair(2, 0), DimPair(0, 2)}}); + Eigen::array dims = {{DimPair(2, 0), DimPair(0, 2)}}; mat3 = mat1.contract(mat2, dims); VERIFY_IS_APPROX(mat3(0, 0), @@ -279,7 +279,7 @@ static void test_out_of_order_contraction() mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); - Eigen::array dims2({{DimPair(0, 2), DimPair(2, 0)}}); + Eigen::array dims2 = {{DimPair(0, 2), DimPair(2, 0)}}; mat3 = mat1.contract(mat2, dims2); VERIFY_IS_APPROX(mat3(0, 0), @@ -311,8 +311,8 @@ static void test_consistency() Tensor mat4(2, 1, 5, 5); // contract on dimensions of size 4 and 3 - Eigen::array dims1({{DimPair(0, 4), DimPair(1, 0)}}); - Eigen::array dims2({{DimPair(4, 0), DimPair(0, 1)}}); + Eigen::array dims1 = {{DimPair(0, 4), DimPair(1, 0)}}; + Eigen::array dims2 = {{DimPair(4, 0), DimPair(0, 1)}}; mat3 = mat1.contract(mat2, dims1); mat4 = mat2.contract(mat1, dims2); @@ -354,7 +354,7 @@ static void test_large_contraction() Eigen::Matrix m_result(1500, 1400); // this contraction should be equivalent to a single matrix multiplication - Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array dims = {{DimPair(2, 0), DimPair(3, 1)}}; // compute results by separate methods t_result = t_left.contract(t_right, dims); @@ -399,10 +399,10 @@ static void test_tensor_vector() { Tensor t_left(7, 13, 17); Tensor t_right(1, 7); - + t_left.setRandom(); t_right.setRandom(); - + typedef typename Tensor::DimensionPair DimensionPair; Eigen::array dim_pair01{{{0, 1}}}; Tensor t_result = t_left.contract(t_right, dim_pair01); @@ -434,7 +434,7 @@ static void test_small_blocking_factors() Eigen::setCpuCacheSizes(896, 1920, 2944); // this contraction should be equivalent to a single matrix multiplication - Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array dims = {{DimPair(2, 0), DimPair(3, 1)}}; Tensor t_result; t_result = t_left.contract(t_right, dims); -- cgit v1.2.3 From c561eeb7bf7daece1174a98e721783c2b93f8ec5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 14:12:45 -0800 Subject: Don't use implicit type conversions in initializer lists since not all compilers support them. 
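Why the change below matters: inside the FFT evaluator, RealScalar is a template parameter, and a user-supplied scalar type may offer only an explicit constructor from double, so a bare double literal in a braced initializer will not convert on every compiler. Wrapping each entry in RealScalar(...) makes the construction explicit. A minimal sketch of the failure mode, using a hypothetical scalar type (not part of the patch):

    // Hypothetical custom scalar whose construction from double is explicit.
    struct MyScalar {
      explicit MyScalar(double v) : value(v) {}
      double value;
    };

    // MyScalar lut[2] = { 0.0, -2.0 };                   // ill-formed: ctor is explicit
    MyScalar lut[2] = { MyScalar(0.0), MyScalar(-2.0) };  // portable: explicit construction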
--- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 128 ++++++++++++------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index a5aa05da4..867512d67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -564,74 +564,74 @@ struct TensorEvaluator, D // This will support a maximum FFT size of 2^32 for each dimension // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; RealScalar m_sin_PI_div_n_LUT[32] = { - 0.0, - -2, - -0.999999999999999, - -0.292893218813453, - -0.0761204674887130, - -0.0192147195967696, - -0.00481527332780311, - -0.00120454379482761, - -3.01181303795779e-04, - -7.52981608554592e-05, - -1.88247173988574e-05, - -4.70619042382852e-06, - -1.17654829809007e-06, - -2.94137117780840e-07, - -7.35342821488550e-08, - -1.83835707061916e-08, - -4.59589268710903e-09, - -1.14897317243732e-09, - -2.87243293150586e-10, - -7.18108232902250e-11, - -1.79527058227174e-11, - -4.48817645568941e-12, - -1.12204411392298e-12, - -2.80511028480785e-13, - -7.01277571201985e-14, - -1.75319392800498e-14, - -4.38298482001247e-15, - -1.09574620500312e-15, - -2.73936551250781e-16, - -6.84841378126949e-17, - -1.71210344531737e-17, - -4.28025861329343e-18 + RealScalar(0.0), + RealScalar(-2), + RealScalar(-0.999999999999999), + RealScalar(-0.292893218813453), + RealScalar(-0.0761204674887130), + RealScalar(-0.0192147195967696), + RealScalar(-0.00481527332780311), + RealScalar(-0.00120454379482761), + RealScalar(-3.01181303795779e-04), + RealScalar(-7.52981608554592e-05), + RealScalar(-1.88247173988574e-05), + RealScalar(-4.70619042382852e-06), + RealScalar(-1.17654829809007e-06), + RealScalar(-2.94137117780840e-07), + RealScalar(-7.35342821488550e-08), + RealScalar(-1.83835707061916e-08), + RealScalar(-4.59589268710903e-09), + RealScalar(-1.14897317243732e-09), + RealScalar(-2.87243293150586e-10), + RealScalar( -7.18108232902250e-11), + RealScalar(-1.79527058227174e-11), + RealScalar(-4.48817645568941e-12), + RealScalar(-1.12204411392298e-12), + RealScalar(-2.80511028480785e-13), + RealScalar(-7.01277571201985e-14), + RealScalar(-1.75319392800498e-14), + RealScalar(-4.38298482001247e-15), + RealScalar(-1.09574620500312e-15), + RealScalar(-2.73936551250781e-16), + RealScalar(-6.84841378126949e-17), + RealScalar(-1.71210344531737e-17), + RealScalar(-4.28025861329343e-18) }; // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { - 0.0, - 0.0, - -1.00000000000000e+00, - -7.07106781186547e-01, - -3.82683432365090e-01, - -1.95090322016128e-01, - -9.80171403295606e-02, - -4.90676743274180e-02, - -2.45412285229123e-02, - -1.22715382857199e-02, - -6.13588464915448e-03, - -3.06795676296598e-03, - -1.53398018628477e-03, - -7.66990318742704e-04, - -3.83495187571396e-04, - -1.91747597310703e-04, - -9.58737990959773e-05, - -4.79368996030669e-05, - -2.39684498084182e-05, - -1.19842249050697e-05, - -5.99211245264243e-06, - -2.99605622633466e-06, - -1.49802811316901e-06, - -7.49014056584716e-07, - -3.74507028292384e-07, - -1.87253514146195e-07, - -9.36267570730981e-08, - -4.68133785365491e-08, - -2.34066892682746e-08, - -1.17033446341373e-08, - -5.85167231706864e-09, - -2.92583615853432e-09 + RealScalar(0.0), + RealScalar(0.0), + RealScalar(-1.00000000000000e+00), + RealScalar(-7.07106781186547e-01), + RealScalar(-3.82683432365090e-01), + 
RealScalar(-1.95090322016128e-01), + RealScalar(-9.80171403295606e-02), + RealScalar(-4.90676743274180e-02), + RealScalar(-2.45412285229123e-02), + RealScalar(-1.22715382857199e-02), + RealScalar(-6.13588464915448e-03), + RealScalar(-3.06795676296598e-03), + RealScalar(-1.53398018628477e-03), + RealScalar(-7.66990318742704e-04), + RealScalar(-3.83495187571396e-04), + RealScalar(-1.91747597310703e-04), + RealScalar(-9.58737990959773e-05), + RealScalar(-4.79368996030669e-05), + RealScalar(-2.39684498084182e-05), + RealScalar(-1.19842249050697e-05), + RealScalar(-5.99211245264243e-06), + RealScalar(-2.99605622633466e-06), + RealScalar(-1.49802811316901e-06), + RealScalar(-7.49014056584716e-07), + RealScalar(-3.74507028292384e-07), + RealScalar(-1.87253514146195e-07), + RealScalar(-9.36267570730981e-08), + RealScalar(-4.68133785365491e-08), + RealScalar(-2.34066892682746e-08), + RealScalar(-1.17033446341373e-08), + RealScalar(-5.85167231706864e-09), + RealScalar(-2.92583615853432e-09) }; }; -- cgit v1.2.3 From 4416a5dcfffe82d519123173204d616e5baecd95 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 14:35:43 -0800 Subject: Added missing include --- unsupported/test/cxx11_tensor_reduction.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 0ec316991..6a128901a 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -9,6 +9,7 @@ #include "main.h" #include +#include #include using Eigen::Tensor; -- cgit v1.2.3 From 667fcc2b531bc100deed9c3575248790d3027ffe Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 14:37:51 -0800 Subject: Fixed syntax error --- unsupported/test/cxx11_tensor_random.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp index 389896c54..0f3dc5787 100644 --- a/unsupported/test/cxx11_tensor_random.cpp +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -48,7 +48,7 @@ struct MyGenerator { } // Same as above but generates several numbers at a time. - typename internal::packet_traits::type packetOp( + internal::packet_traits::type packetOp( Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { const int packetSize = internal::packet_traits::size; EIGEN_ALIGN_MAX int values[packetSize]; -- cgit v1.2.3 From 4e49fd5eb97840d66f6d16208d862200465b7397 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 14:49:18 -0800 Subject: MSVC uses __uint128 while other compilers use __uint128_t to encode 128bit unsigned integers. Make the cxx11_tensor_uint128.cpp test work in both cases. 
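The typedef introduced below routes the compiler-specific 128-bit type through a single name so the rest of the test stays uniform. (The premise is shaky, though: MSVC in fact provides no native 128-bit integer at all, which is why two later patches in this series end up skipping the test there via EIGEN_NO_INT128.) A hedged sketch of the feature detection the series converges on:

    // Illustrative detection only; EIGEN_NO_INT128 is the macro a later patch introduces.
    #if defined(__SIZEOF_INT128__)   // GCC and Clang advertise native 128-bit integers
    typedef __uint128_t uint128_t;
    #else
    #define EIGEN_NO_INT128          // no native type: skip the 128-bit-only code paths
    #endif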
--- unsupported/test/cxx11_tensor_uint128.cpp | 39 ++++++++++++++++++------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp index 424c70197..c6766c6c6 100644 --- a/unsupported/test/cxx11_tensor_uint128.cpp +++ b/unsupported/test/cxx11_tensor_uint128.cpp @@ -11,10 +11,17 @@ #include +#if EIGEN_COMP_MSVC +typedef __uint128 uint128_t; +#else +typedef __uint128_t uint128_t; +#endif + + using Eigen::internal::TensorUInt128; using Eigen::internal::static_val; -void VERIFY_EQUAL(TensorUInt128 actual, __uint128_t expected) { +void VERIFY_EQUAL(TensorUInt128 actual, uint128_t expected) { bool matchl = actual.lower() == static_cast(expected); bool matchh = actual.upper() == static_cast(expected >> 64); if (!matchl || !matchh) { @@ -32,13 +39,13 @@ void test_add() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128 i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast(i1) << 64) + static_cast(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128 j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast(j1) << 64) + static_cast(j2); TensorUInt128 actual = i + j; - __uint128_t expected = a + b; + uint128_t expected = a + b; VERIFY_EQUAL(actual, expected); } } @@ -51,13 +58,13 @@ void test_sub() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128 i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast(i1) << 64) + static_cast(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128 j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast(j1) << 64) + static_cast(j2); TensorUInt128 actual = i - j; - __uint128_t expected = a - b; + uint128_t expected = a - b; VERIFY_EQUAL(actual, expected); } } @@ -70,13 +77,13 @@ void test_mul() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128 i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast(i1) << 64) + static_cast(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128 j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast(j1) << 64) + static_cast(j2); TensorUInt128 actual = i * j; - __uint128_t expected = a * b; + uint128_t expected = a * b; VERIFY_EQUAL(actual, expected); } } @@ -89,13 +96,13 @@ void test_div() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128 i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast(i1) << 64) + static_cast(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128 j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast(j1) << 64) + static_cast(j2); TensorUInt128 actual = i / j; - __uint128_t expected = a / b; + 
uint128_t expected = a / b; VERIFY_EQUAL(actual, expected); } } @@ -107,10 +114,10 @@ void test_misc1() { uint64_t incr = internal::random(1, 9999999999); for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128, uint64_t> i(0, i2); - __uint128_t a = static_cast<__uint128_t>(i2); + uint128_t a = static_cast(i2); for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128, uint64_t> j(0, j2); - __uint128_t b = static_cast<__uint128_t>(j2); + uint128_t b = static_cast(j2); uint64_t actual = (i * j).upper(); uint64_t expected = (a * b) >> 64; VERIFY_IS_EQUAL(actual, expected); @@ -122,7 +129,7 @@ void test_misc2() { int64_t incr = internal::random(1, 100); for (int64_t log_div = 0; log_div < 63; ++log_div) { for (int64_t divider = 1; divider <= 1000000 * incr; divider += incr) { - uint64_t expected = (static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1; + uint64_t expected = (static_cast(1) << (64+log_div)) / static_cast(divider) - (static_cast(1) << 64) + 1; uint64_t shift = 1ULL << log_div; TensorUInt128 result = (TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) - TensorUInt128, static_val<0> >(1, 0) + TensorUInt128, static_val<1> >(1)); -- cgit v1.2.3 From 60d9df11c172ecb040ce5ba08087ee4c3fd8e9e6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 16:27:02 -0800 Subject: Fixed the computation of leading zeros when compiling with msvc. --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 78d6da28a..75567f1ff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -36,14 +36,7 @@ namespace { #ifdef __CUDA_ARCH__ return (sizeof(T) == 8) ? __clzll(val) : __clz(val); #elif EIGEN_COMP_MSVC - DWORD leading_zeros = 0; - if (sizeof(T) == 8) { - _BitScanReverse64(&leading_zeros, val); - } - else { - _BitScanReverse(&leading_zeros, val); - } - return leading_zeros; + return (sizeof(T) == 8) ? __lzcnt64(val) : __lzcnt(val); #else EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); return (sizeof(T) == 8) ? -- cgit v1.2.3 From ec35068edcacee6aae67c136e6f7c26e473186bd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 16:42:38 -0800 Subject: Don't rely on the M_PI constant since not all compilers provide it. 
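For context: M_PI is a POSIX extension rather than standard C++, and MSVC, for instance, only exposes it from <cmath> when _USE_MATH_DEFINES is defined before inclusion. Spelling out the literal, as the patch below does, removes that dependency; computing the constant from the standard library is an equivalent portable alternative (a sketch, not what the patch uses):

    #include <cmath>

    // pi derived from the standard library instead of the non-standard M_PI macro.
    const double pi = std::acos(-1.0);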
--- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 867512d67..ece2695ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -225,8 +225,9 @@ struct TensorEvaluator, D // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 pos_j_base_powered[0] = ComplexScalar(1, 0); if (line_len > 1) { + const RealScalar PI(3.14159265358979323846); const ComplexScalar pos_j_base = ComplexScalar( - std::cos(M_PI / line_len), std::sin(M_PI / line_len)); + std::cos(PI / line_len), std::sin(PI / line_len)); pos_j_base_powered[1] = pos_j_base; if (line_len > 2) { const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; -- cgit v1.2.3 From c23e0be18f7b13f476b5b118986a36bf5d24ada5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Mar 2016 20:18:01 -0800 Subject: Use the CMAKE_CXX_STANDARD variable to turn on cxx11 --- unsupported/test/CMakeLists.txt | 100 ++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index cb056690a..76a517c40 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -99,59 +99,61 @@ ei_add_test(kronecker_product) if(EIGEN_TEST_CXX11) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. - ei_add_test(cxx11_meta "-std=c++0x") - ei_add_test(cxx11_tensor_simple "-std=c++0x") -# ei_add_test(cxx11_tensor_symmetry "-std=c++0x") - ei_add_test(cxx11_tensor_assign "-std=c++0x") - ei_add_test(cxx11_tensor_dimension "-std=c++0x") - ei_add_test(cxx11_tensor_index_list "-std=c++0x") - ei_add_test(cxx11_tensor_mixed_indices "-std=c++0x") - ei_add_test(cxx11_tensor_comparisons "-std=c++0x") - ei_add_test(cxx11_tensor_contraction "-std=c++0x") - ei_add_test(cxx11_tensor_convolution "-std=c++0x") - ei_add_test(cxx11_tensor_expr "-std=c++0x") - ei_add_test(cxx11_tensor_math "-std=c++0x") - ei_add_test(cxx11_tensor_forced_eval "-std=c++0x") - ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") - ei_add_test(cxx11_tensor_const "-std=c++0x") - ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") - ei_add_test(cxx11_tensor_of_complex "-std=c++0x") - ei_add_test(cxx11_tensor_of_strings "-std=c++0x") - ei_add_test(cxx11_tensor_intdiv "-std=c++0x") - ei_add_test(cxx11_tensor_lvalue "-std=c++0x") - ei_add_test(cxx11_tensor_map "-std=c++0x") - ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") - ei_add_test(cxx11_tensor_chipping "-std=c++0x") - ei_add_test(cxx11_tensor_concatenation "-std=c++0x") - ei_add_test(cxx11_tensor_inflation "-std=c++0x") - ei_add_test(cxx11_tensor_morphing "-std=c++0x") - ei_add_test(cxx11_tensor_padding "-std=c++0x") - ei_add_test(cxx11_tensor_patch "-std=c++0x") - ei_add_test(cxx11_tensor_image_patch "-std=c++0x") - ei_add_test(cxx11_tensor_volume_patch "-std=c++0x") - ei_add_test(cxx11_tensor_reduction "-std=c++0x") - ei_add_test(cxx11_tensor_argmax "-std=c++0x") - ei_add_test(cxx11_tensor_shuffling "-std=c++0x") - ei_add_test(cxx11_tensor_striding "-std=c++0x") + set(CMAKE_CXX_STANDARD 11) + + ei_add_test(cxx11_meta) + ei_add_test(cxx11_tensor_simple) +# ei_add_test(cxx11_tensor_symmetry) + ei_add_test(cxx11_tensor_assign) + ei_add_test(cxx11_tensor_dimension) + ei_add_test(cxx11_tensor_index_list) + 
ei_add_test(cxx11_tensor_mixed_indices) + ei_add_test(cxx11_tensor_comparisons) + ei_add_test(cxx11_tensor_contraction) + ei_add_test(cxx11_tensor_convolution) + ei_add_test(cxx11_tensor_expr) + ei_add_test(cxx11_tensor_math) + ei_add_test(cxx11_tensor_forced_eval) + ei_add_test(cxx11_tensor_fixed_size) + ei_add_test(cxx11_tensor_const) + ei_add_test(cxx11_tensor_of_const_values) + ei_add_test(cxx11_tensor_of_complex) + ei_add_test(cxx11_tensor_of_strings) + ei_add_test(cxx11_tensor_intdiv) + ei_add_test(cxx11_tensor_lvalue) + ei_add_test(cxx11_tensor_map) + ei_add_test(cxx11_tensor_broadcasting) + ei_add_test(cxx11_tensor_chipping) + ei_add_test(cxx11_tensor_concatenation) + ei_add_test(cxx11_tensor_inflation) + ei_add_test(cxx11_tensor_morphing) + ei_add_test(cxx11_tensor_padding) + ei_add_test(cxx11_tensor_patch) + ei_add_test(cxx11_tensor_image_patch) + ei_add_test(cxx11_tensor_volume_patch) + ei_add_test(cxx11_tensor_reduction) + ei_add_test(cxx11_tensor_argmax) + ei_add_test(cxx11_tensor_shuffling) + ei_add_test(cxx11_tensor_striding) ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") - ei_add_test(cxx11_tensor_ref "-std=c++0x") - ei_add_test(cxx11_tensor_random "-std=c++0x") - ei_add_test(cxx11_tensor_casts "-std=c++0x") - ei_add_test(cxx11_tensor_roundings "-std=c++0x") - ei_add_test(cxx11_tensor_reverse "-std=c++0x") - ei_add_test(cxx11_tensor_layout_swap "-std=c++0x") - ei_add_test(cxx11_tensor_io "-std=c++0x") - ei_add_test(cxx11_tensor_generator "-std=c++0x") - ei_add_test(cxx11_tensor_custom_op "-std=c++0x") - ei_add_test(cxx11_tensor_custom_index "-std=c++0x") - ei_add_test(cxx11_tensor_sugar "-std=c++0x") - ei_add_test(cxx11_tensor_fft "-std=c++0x") - ei_add_test(cxx11_tensor_ifft "-std=c++0x") - ei_add_test(cxx11_tensor_empty "-std=c++0x") + ei_add_test(cxx11_tensor_ref) + ei_add_test(cxx11_tensor_random) + ei_add_test(cxx11_tensor_casts) + ei_add_test(cxx11_tensor_roundings) + ei_add_test(cxx11_tensor_reverse) + ei_add_test(cxx11_tensor_layout_swap) + ei_add_test(cxx11_tensor_io) + ei_add_test(cxx11_tensor_generator) + ei_add_test(cxx11_tensor_custom_op) + ei_add_test(cxx11_tensor_custom_index) + ei_add_test(cxx11_tensor_sugar) + ei_add_test(cxx11_tensor_fft) + ei_add_test(cxx11_tensor_ifft) + ei_add_test(cxx11_tensor_empty) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") # This test requires __uint128_t which is only available on 64bit systems - ei_add_test(cxx11_tensor_uint128 "-std=c++0x") + ei_add_test(cxx11_tensor_uint128) endif() endif() -- cgit v1.2.3 From 0b9e0abc96d5c0367ee6c443f71754637b0db7e4 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Fri, 4 Mar 2016 21:12:10 -0800 Subject: Make igamma and igammac work correctly. This required replacing ::abs with std::abs. Modified some unit tests. 
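The ::abs -> std::abs replacement mentioned above is a correctness fix, not a cosmetic one: on some toolchains the only abs visible at global scope is the C library's int abs(int), so a floating-point argument is silently truncated before the absolute value is taken. An unqualified call after a using-declaration picks the proper overload and still permits argument-dependent lookup for custom scalar types. A small illustration of the trap, assuming a platform where only the C overload is global:

    #include <cstdlib>  // declares int abs(int) at global scope
    #include <cmath>    // declares the std::abs overloads for float and double

    double trap()  { return ::abs(-0.7); }  // may resolve to abs(int): -0.7 -> 0, yields 0.0

    double fixed() {
      using std::abs;                       // bring the floating-point overloads into scope
      return abs(-0.7);                     // calls std::abs(double), yields 0.7
    }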
--- Eigen/src/Core/SpecialFunctions.h | 119 +++++++++++++++++++++++++++----------- test/array.cpp | 64 ++++++++++---------- 2 files changed, 119 insertions(+), 64 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index ff2146afc..4a61325d4 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -296,7 +296,8 @@ struct digamma_impl { if (x <= zero) { negative = one; q = x; - p = ::floor(q); + using std::floor; + p = floor(q); if (p == q) { return maxnum; } @@ -309,7 +310,8 @@ struct digamma_impl { p += one; nz = q - p; } - nz = m_pi / ::tan(m_pi * nz); + using std::tan; + nz = m_pi / tan(m_pi * nz); } else { nz = zero; @@ -327,7 +329,8 @@ struct digamma_impl { y = digamma_impl_maybe_poly::run(s); - y = ::log(s) - (half / s) - y - w; + using std::log; + y = log(s) - (half / s) - y - w; return (negative) ? y - nz : y; } @@ -426,6 +429,39 @@ struct igammac_impl { template struct igamma_impl; // predeclare igamma_impl +template +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar big() { assert(false && "big not supported for this type"); return 0.0; } +}; + +template <> +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static float machep() { + return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static float big() { + // use epsneg (1.0 - epsneg == 1.0) + return 1.0 / (NumTraits::epsilon() / 2); + } +}; + +template <> +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static double machep() { + return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static double big() { + return 1.0 / NumTraits::epsilon(); + } +}; + template struct igammac_impl { EIGEN_DEVICE_FUNC @@ -487,26 +523,35 @@ struct igammac_impl { const Scalar zero = 0; const Scalar one = 1; const Scalar two = 2; - const Scalar machep = NumTraits::epsilon(); + const Scalar machep = igamma_helper::machep(); const Scalar maxlog = ::log(NumTraits::highest()); - const Scalar big = one / machep; + const Scalar big = igamma_helper::big(); + const Scalar biginv = 1 / big; + const Scalar nan = NumTraits::quiet_NaN(); Scalar ans, ax, c, yc, r, t, y, z; Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; - if ((x <= zero) || ( a <= zero)) { - return one; + if ((x < zero) || ( a <= zero)) { + // domain error + return nan; } if ((x < one) || (x < a)) { return (one - igamma_impl::run(a, x)); } - ax = a * ::log(x) - x - lgamma_impl::run(a); - if( ax < -maxlog ) { // underflow + using std::isinf; + if ((isinf)(x)) return zero; + + /* Compute x**a * exp(-x) / gamma(a) */ + using std::log; + ax = a * log(x) - x - lgamma_impl::run(a); + if (ax < -maxlog) { // underflow return zero; } - ax = ::exp(ax); + using std::exp; + ax = exp(ax); // continued fraction y = one - a; @@ -516,35 +561,36 @@ struct igammac_impl { qkm2 = x; pkm1 = x + one; qkm1 = z * x; - ans = pkm1/qkm1; + ans = pkm1 / qkm1; + using std::abs; do { c += one; y += one; z += two; yc = y * c; - pk = pkm1 * z - pkm2 * yc; - qk = qkm1 * z - qkm2 * yc; - if( qk != zero ) { - r = pk/qk; - t = ::abs( (ans - r)/r ); + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != zero) { + r = pk / qk; + t = abs((ans - r) / r); ans = r; - } else { + } else { t = one; } pkm2 = pkm1; pkm1 = pk; qkm2 = qkm1; qkm1 = qk; - if (::abs(pk) > big) 
{ - pkm2 *= machep; - pkm1 *= machep; - qkm2 *= machep; - qkm1 *= machep; + if (abs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; } - } while( t > machep ); + } while (t > machep); - return ( ans * ax ); + return (ans * ax); } }; @@ -639,26 +685,31 @@ struct igamma_impl { */ const Scalar zero = 0; const Scalar one = 1; - const Scalar machep = NumTraits::epsilon(); + const Scalar machep = igamma_helper::machep(); const Scalar maxlog = ::log(NumTraits::highest()); + const Scalar nan = NumTraits::quiet_NaN(); double ans, ax, c, r; - if( (x <= zero) || ( a <= zero) ) { - return zero; + if (x == zero) return zero; + + if ((x < zero) || ( a <= zero)) { // domain error + return nan; } - if( (x > one) && (x > a ) ) { - return (one - igammac_impl::run(a,x)); + if ((x > one) && (x > a)) { + return (one - igammac_impl::run(a, x)); } /* Compute x**a * exp(-x) / gamma(a) */ - ax = a * ::log(x) - x - lgamma_impl::run(a); - if( ax < -maxlog ) { + using std::log; + ax = a * log(x) - x - lgamma_impl::run(a); + if (ax < -maxlog) { // underflow return zero; } - ax = ::exp(ax); + using std::exp; + ax = exp(ax); /* power series */ r = a; @@ -669,9 +720,9 @@ struct igamma_impl { r += one; c *= x/r; ans += c; - } while( c/ans > machep ); + } while (c/ans > machep); - return( ans * ax/a ); + return (ans * ax / a); } }; diff --git a/test/array.cpp b/test/array.cpp index a37874cc2..c61bfc8ed 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -331,41 +331,45 @@ template void array_real(const ArrayType& m) VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); - Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)}; - Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)}; + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; // location i*6+j corresponds to a_s[i], x_s[j]. 
Scalar nan = std::numeric_limits::quiet_NaN(); - Scalar igamma_s[][6] = { - {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, - {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, - 9.999500016666262e-05, 1.0}, - {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, - 7.522076445089201e-07, 1.0}, - {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, - 4.166333347221828e-18, 1.0}, - {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, - 0.9991370418689945, 1.0}, - {0.0, 0.0, 0.0, 0.0, 0.0, 0.5013297751014064}}; - Scalar igammac_s[][6] = { - {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, - {1.0, 0.36787944117144233, 0.22313016014842982, - 0.018315638888734182, 0.9999000049998333, 0.0}, - {1.0, 0.5724067044708798, 0.3916251762710878, - 0.04601170568923136, 0.9999992477923555, 0.0}, - {1.0, 0.9810118431238462, 0.9343575456215499, - 0.4334701203667089, 1.0, 0.0}, - {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, - 3.7801620118431334e-07, 0.0008629581310054535, 0.0}, - {1.0, 1.0, 1.0, 1.0, 1.0, 0.49867022490946517}}; + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { - //std::cout << numext::igamma(a_s[i], x_s[j]) << " vs. " << igamma_s[i][j] << std::endl; - //std::cout << numext::igammac(a_s[i], x_s[j]) << " c.vs. 
" << - //igammac_s[i][j] << std::endl; - std::cout << a_s[i] << ", " << x_s[j] << std::endl; - VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); - VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); + } + + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + } } } } -- cgit v1.2.3 From 23aed8f2e4c1c22280d4f13e6e4f89a622bd096a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 5 Mar 2016 08:04:45 -0800 Subject: Use EIGEN_PI instead of redefining our own constant PI --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index ece2695ee..7086a426d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -225,9 +225,9 @@ struct TensorEvaluator, D // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 pos_j_base_powered[0] = ComplexScalar(1, 0); if (line_len > 1) { - const RealScalar PI(3.14159265358979323846); + const RealScalar pi_over_len(EIGEN_PI / line_len); const ComplexScalar pos_j_base = ComplexScalar( - std::cos(PI / line_len), std::sin(PI / line_len)); + std::cos(pi_over_len), std::sin(pi_over_len)); pos_j_base_powered[1] = pos_j_base; if (line_len > 2) { const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; -- cgit v1.2.3 From 57b263c5b9da8699386ce2c046c14f12e6c59533 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 5 Mar 2016 08:35:26 -0800 Subject: Avoid using initializer lists in test since not all version of msvc support them --- unsupported/test/cxx11_tensor_of_complex.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp index 25e51143e..e9d1b2d3c 100644 --- a/unsupported/test/cxx11_tensor_of_complex.cpp +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -83,7 +83,9 @@ static void test_contractions() // This contraction should be equivalent to a regular matrix multiplication typedef Tensor::DimensionPair DimPair; - Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array dims; + dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); t_result = t_left.contract(t_right, dims); m_result = m_left * m_right; for (int i = 0; i < t_result.dimensions().TotalSize(); i++) { -- cgit v1.2.3 From 6093eb9ff564ded564fb48fdd01288140a9e3be6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 5 Mar 2016 10:37:11 -0800 Subject: Don't test our 128bit emulation code when compiling with msvc --- unsupported/test/cxx11_tensor_uint128.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp index c6766c6c6..e8ecc5bca 100644 --- a/unsupported/test/cxx11_tensor_uint128.cpp +++ b/unsupported/test/cxx11_tensor_uint128.cpp @@ -11,11 +11,7 @@ #include -#if EIGEN_COMP_MSVC -typedef __uint128 uint128_t; -#else typedef __uint128_t uint128_t; -#endif using Eigen::internal::TensorUInt128; @@ -142,10 +138,15 @@ void test_misc2() { void test_cxx11_tensor_uint128() { +#if EIGEN_COMP_MSVC + // Skip the test on 
compilers that don't support 128bit integers natively + return; +#else CALL_SUBTEST_1(test_add()); CALL_SUBTEST_2(test_sub()); CALL_SUBTEST_3(test_mul()); CALL_SUBTEST_4(test_div()); CALL_SUBTEST_5(test_misc1()); CALL_SUBTEST_6(test_misc2()); +#endif } -- cgit v1.2.3
From 05bbca079a1c21fe3289d625a99200621606a0f6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 5 Mar 2016 10:52:08 -0800 Subject: Turn on some of the cxx11 features when compiling with visual studio 2015
--- Eigen/src/Core/util/Macros.h | 2 +- unsupported/Eigen/CXX11/Core | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index d196123c6..dbfc9bd37 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -360,7 +360,7 @@ #endif // Does the compiler support variadic templates? -#if __cplusplus > 199711L +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 // Disable the use of variadic templates when compiling with nvcc on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #if !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64
diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core index c8dcf7c16..e3e2cb60c 100644 --- a/unsupported/Eigen/CXX11/Core +++ b/unsupported/Eigen/CXX11/Core @@ -35,11 +35,13 @@ #include "src/Core/util/EmulateArray.h" // Emulate the cxx11 functionality that we need if the compiler doesn't support it. -#if __cplusplus <= 199711L -#include "src/Core/util/EmulateCXX11Meta.h" -#else +// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it +// supports enough of the standard for our needs +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 #include "src/Core/util/CXX11Workarounds.h" #include "src/Core/util/CXX11Meta.h" +#else +#include "src/Core/util/EmulateCXX11Meta.h" #endif #include -- cgit v1.2.3
From 9a54c3e32bfefc5f7ffb998e5971ba80fc7a52ad Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 6 Mar 2016 09:38:56 -0800 Subject: Don't warn that msvc 2015 isn't c++11 compliant just because it doesn't claim to be.
--- unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index b1528aa66..fe4d22803 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -29,8 +29,10 @@ /* Check that the compiler at least claims to support C++11. It might not be sufficient * because the compiler may not implement it correctly, but at least we'll know. + * On the other hand, visual studio still doesn't claim to support C++11 although it's + * compliant enough for our purpose.
*/ -#if __cplusplus <= 199711L +#if (__cplusplus <= 199711L) && (EIGEN_COMP_MSVC < 1900) #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) #pragma GCC diagnostic error "-Wfatal-errors" #endif -- cgit v1.2.3
From 5238e03fe1b34f2e28fbf1321ee48621f0f9363c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 6 Mar 2016 21:59:40 -0800 Subject: Don't try to compile the uint128 test with compilers that don't support uint128
--- unsupported/test/cxx11_tensor_uint128.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp index e8ecc5bca..2cbc45716 100644 --- a/unsupported/test/cxx11_tensor_uint128.cpp +++ b/unsupported/test/cxx11_tensor_uint128.cpp @@ -11,8 +11,15 @@ #include + +#if EIGEN_COMP_MSVC +#define EIGEN_NO_INT128 +#else typedef __uint128_t uint128_t; +#endif +// Only run the test on compilers that support 128bit integers natively +#ifndef EIGEN_NO_INT128 using Eigen::internal::TensorUInt128; using Eigen::internal::static_val; @@ -134,11 +141,12 @@ void test_misc2() { } } } +#endif void test_cxx11_tensor_uint128() { -#if EIGEN_COMP_MSVC +#ifdef EIGEN_NO_INT128 // Skip the test on compilers that don't support 128bit integers natively return; #else -- cgit v1.2.3
From 9f5740cbc108adf477521f025605891ebeda61c6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 6 Mar 2016 22:03:18 -0800 Subject: Added missing include
--- unsupported/test/cxx11_meta.cpp | 1 + 1 file changed, 1 insertion(+)
diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index 4f45e1dd3..f9179bbe6 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -9,6 +9,7 @@ #include "main.h" +#include #include using Eigen::internal::is_same; -- cgit v1.2.3
From e5f25622e20563d35d51de356379bdfd257f1d67 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Mar 2016 09:04:27 -0800 Subject: Added a test to validate the behavior of some of the tensor syntactic sugar.
--- unsupported/test/cxx11_tensor_sugar.cpp | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+)
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index adac472cf..a03f75cfe 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -32,7 +32,30 @@ static void test_comparison_sugar() { #undef TEST_TENSOR_EQUAL } + +static void test_scalar_sugar() { + Tensor A(6, 7, 5); + Tensor B(6, 7, 5); + A.setRandom(); + B.setRandom(); + + const float alpha = 0.43f; + const float beta = 0.21f; + + Tensor R = A * A.constant(alpha) + B * B.constant(beta); + Tensor S = A * alpha + B * beta; + + // TODO: add enough syntactic sugar to support this + // Tensor T = alpha * A + beta * B; + + for (int i = 0; i < 6*7*5; ++i) { + VERIFY_IS_APPROX(R(i), S(i)); + } +} + + void test_cxx11_tensor_sugar() { CALL_SUBTEST(test_comparison_sugar()); + CALL_SUBTEST(test_scalar_sugar()); } -- cgit v1.2.3
From 5707004d6b947c202085c3ead889e277264ea36a Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 7 Mar 2016 14:08:56 -0800 Subject: Fix Eigen's building of sharded tests that use CUDA & more igamma/igammac bugfixes. 0. Prior to this PR, not a single sharded CUDA test was actually being *run*. Fixed that. GPU tests are still failing for igamma/igammac. 1. Add calls for igamma/igammac to TensorBase 2. Fix up CUDA-specific calls of igamma/igammac 3. Add unit tests for digamma, igamma, igammac in CUDA.
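With the TensorBase hooks from this patch in place, the incomplete gamma functions compose like any other coefficient-wise binary operation. A usage sketch (shapes, and the shifting of the arguments into the valid domain, are illustrative only):

    #include <unsupported/Eigen/CXX11/Tensor>

    Eigen::Tensor<float, 2> a(6, 6), x(6, 6);
    a.setRandom();
    x.setRandom();
    a = a.abs() + a.constant(1.0f);  // igamma requires a > 0
    x = x.abs();                     // ... and x >= 0

    // Regularized lower incomplete gamma P(a, x) and its complement Q(a, x);
    // coefficient-wise, P + Q == 1 wherever both are defined.
    Eigen::Tensor<float, 2> p = a.igamma(x);
    Eigen::Tensor<float, 2> q = a.igammac(x);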
--- Eigen/src/Core/GenericPacketMath.h | 4 +- Eigen/src/Core/arch/CUDA/MathFunctions.h | 34 +++- Eigen/src/Core/arch/CUDA/PacketMath.h | 1 - cmake/EigenTesting.cmake | 7 +- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 15 ++ unsupported/test/CMakeLists.txt | 16 +- unsupported/test/cxx11_tensor_cuda.cu | 205 ++++++++++++++++++++++++ 7 files changed, 268 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index ead0253df..802def51d 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -460,11 +460,11 @@ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); } /** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); } /*************************************************************************** diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 6e84d3af8..6822700f8 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -116,24 +116,42 @@ double2 perfc(const double2& a) return make_double2(erfc(a.x), erfc(a.y)); } + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma(const float4& a, const float4& x) { - using numext::pigamma; + using numext::igamma; return make_float4( - pigamma(a.x, x.x), - pigamma(a.y, x.y), - pigamma(a.z, x.z), - pigamma(a.w, x.w)); + igamma(a.x, x.x), + igamma(a.y, x.y), + igamma(a.z, x.z), + igamma(a.w, x.w)); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double2 pigammac(const double2& a, const double& x) +double2 pigamma(const double2& a, const double2& x) { - using numext::pigammac; - return make_double2(pigammac(a.x, x.x), pigammac(a.y, x.y)); + using numext::igamma; + return make_double2(igamma(a.x, x.x), igamma(a.y, x.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigammac(const float4& a, const float4& x) +{ + using numext::igammac; + return make_float4( + igammac(a.x, x.x), + igammac(a.y, x.y), + igammac(a.z, x.z), + igammac(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac(const double2& a, const double2& x) +{ + using numext::igammac; + return make_double2(igammac(a.x, x.x), igammac(a.y, x.y)); +} #endif diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index d2563030b..25d964600 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -284,7 +284,6 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { return make_double2(fabs(a.x), fabs(a.y)); } - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { double tmp = kernel.packet[0].y; diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 5022397a7..5ca800cfe 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -19,12 +19,15 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() 
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) - cuda_add_executable(${targetname} ${filename}) + if (${ARGC} GREATER 2) + cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) + else() + cuda_add_executable(${targetname} ${filename}) + endif() else() add_executable(${targetname} ${filename}) endif() - if (targetname MATCHES "^eigen2_") add_dependencies(eigen2_buildtests ${targetname}) else() diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 4dea1d3a0..aa67b9811 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -315,12 +315,27 @@ class TensorBase operator==(const OtherDerived& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator!=(const OtherDerived& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + // comparisons and tests for Scalars EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index c202cf0e4..17f83915b 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -1,3 +1,17 @@ +# generate split test header file only if it does not yet exist +# in order to prevent a rebuild every time cmake is configured +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "") + foreach(i RANGE 1 999) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h + "#ifdef EIGEN_TEST_PART_${i}\n" + "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n" + "#else\n" + "#define CALL_SUBTEST_${i}(FUNC)\n" + "#endif\n\n" + ) + endforeach() +endif() set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported") add_custom_target(BuildUnsupported) @@ -158,7 +172,7 @@ endif() # These tests need nvcc find_package(CUDA 7.0) if(CUDA_FOUND) - set(CUDA_PROPAGATE_HOST_FLAGS OFF) +# set(CUDA_PROPAGATE_HOST_FLAGS OFF) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 58da21d3b..348271e4b 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -574,6 +574,195 @@ void test_cuda_lgamma(const Scalar stddev) cudaFree(d_out); } +template +void test_cuda_digamma() +{ + Tensor in(7); + Tensor out(7); + Tensor expected_out(7); + out.setZero(); + + in(0) = Scalar(1); + in(1) = Scalar(1.5); + in(2) = Scalar(4); + in(3) = Scalar(-10.5); + in(4) = Scalar(10000.5); + in(5) = Scalar(0); + in(6) = Scalar(-1); + + expected_out(0) = Scalar(-0.5772156649015329); + expected_out(1) = Scalar(0.03648997397857645); +
expected_out(2) = Scalar(1.2561176684318); + expected_out(3) = Scalar(2.398239129535781); + expected_out(4) = Scalar(9.210340372392849); + expected_out(5) = std::numeric_limits::infinity(); + expected_out(6) = std::numeric_limits::infinity(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 7); + Eigen::TensorMap > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in.digamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 5; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + for (int i = 5; i < 7; ++i) { + VERIFY_IS_EQUAL(out(i), expected_out(i)); + } +} + +template +void test_cuda_igamma() +{ + Tensor a(6, 6); + Tensor x(6, 6); + Tensor out(6, 6); + Tensor expected_out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[i]; + } + } + + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + + + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 6, 6); + Eigen::TensorMap > gpu_x(d_x, 6, 6); + Eigen::TensorMap > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + printf("got: %g\n", out(i, j)); + //VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); + } + } + } +} + +template +void test_cuda_igammac() +{ + Tensor a(6, 6); + Tensor x(6, 6); + Tensor out(6, 6); + Tensor expected_out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[i]; + } + } + + Scalar nan = 
std::numeric_limits::quiet_NaN(); + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 6, 6); + Eigen::TensorMap > gpu_x(d_x, 6, 6); + Eigen::TensorMap > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igammac_s[i][j])) { + printf("got: %g\n", out(i, j)); + //VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); + } + } + } +} + template void test_cuda_erf(const Scalar stddev) { @@ -667,30 +856,46 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_3(test_cuda_convolution_2d()); CALL_SUBTEST_3(test_cuda_convolution_3d()); CALL_SUBTEST_3(test_cuda_convolution_3d()); + CALL_SUBTEST_4(test_cuda_lgamma(1.0f)); CALL_SUBTEST_4(test_cuda_lgamma(100.0f)); CALL_SUBTEST_4(test_cuda_lgamma(0.01f)); CALL_SUBTEST_4(test_cuda_lgamma(0.001f)); + + CALL_SUBTEST_4(test_cuda_digamma()); + CALL_SUBTEST_4(test_cuda_erf(1.0f)); CALL_SUBTEST_4(test_cuda_erf(100.0f)); CALL_SUBTEST_4(test_cuda_erf(0.01f)); CALL_SUBTEST_4(test_cuda_erf(0.001f)); + CALL_SUBTEST_4(test_cuda_erfc(1.0f)); // CALL_SUBTEST(test_cuda_erfc(100.0f)); CALL_SUBTEST_4(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs CALL_SUBTEST_4(test_cuda_erfc(0.01f)); CALL_SUBTEST_4(test_cuda_erfc(0.001f)); + CALL_SUBTEST_4(test_cuda_lgamma(1.0)); CALL_SUBTEST_4(test_cuda_lgamma(100.0)); CALL_SUBTEST_4(test_cuda_lgamma(0.01)); CALL_SUBTEST_4(test_cuda_lgamma(0.001)); + + CALL_SUBTEST_4(test_cuda_digamma()); + CALL_SUBTEST_4(test_cuda_erf(1.0)); CALL_SUBTEST_4(test_cuda_erf(100.0)); CALL_SUBTEST_4(test_cuda_erf(0.01)); CALL_SUBTEST_4(test_cuda_erf(0.001)); + CALL_SUBTEST_4(test_cuda_erfc(1.0)); // CALL_SUBTEST(test_cuda_erfc(100.0)); CALL_SUBTEST_4(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs CALL_SUBTEST_4(test_cuda_erfc(0.01)); CALL_SUBTEST_4(test_cuda_erfc(0.001)); + + CALL_SUBTEST_5(test_cuda_igamma()); + CALL_SUBTEST_5(test_cuda_igammac()); + + CALL_SUBTEST_5(test_cuda_igamma()); + CALL_SUBTEST_5(test_cuda_igammac()); } -- cgit v1.2.3 From 7f87cc3a3b397ad5b940289bfd8369eff517efae Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Mar 2016 14:31:27 -0800 Subject: Fix a couple of typos in the code. 
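(The typos in question are two uses of numext::maxi where numext::mini was intended when sizing the CUDA launch; schematically, using the device queries from the diff below, the block count must be clamped to the smaller of the two limits:)

    // max_blocks is a cap: taking the maximum instead could launch more
    // blocks than the device supports.
    const int block_size = device.maxCudaThreadsPerBlock();
    const int hw_blocks = device.getNumCudaMultiProcessors() *
                          device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int max_blocks = numext::mini(device.maxBlocks(), hw_blocks);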
--- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index df15c6204..fd9919829 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run( if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = numext::maxi(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); + const int max_blocks = numext::mini(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); const Index size = array_prod(evaluator.dimensions()); // Create at least one block to ensure we won't crash if we're called with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); @@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(c if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = numext::maxi(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); + const int max_blocks = numext::mini(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); const Index size = array_prod(evaluator.dimensions()); // Create at least one block to ensure we won't crash if we're called with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); -- cgit v1.2.3 From 769685e74e92a66badc599b72d7034cea907a798 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Mar 2016 14:45:37 -0800 Subject: Added the ability to pad a tensor using a non-zero value --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 7 +++- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 45 ++++++++++++---------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 5679e58cf..66772a3ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -643,7 +643,12 @@ class TensorBase template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorPaddingOp pad(const PaddingDimensions& padding) const { - return TensorPaddingOp(derived(), padding); + return TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp(derived(), padding, padding_value); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorShufflingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index c3f25f0df..eaaf4dc86 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -16,7 +16,7 @@ namespace Eigen { * \ingroup CXX11_Tensor_Module * * \brief Tensor padding class. - * At the moment only 0-padding is supported. + * At the moment only padding with a constant value is supported.
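 *
 * A hedged usage sketch of the new overload added above (the array-of-pairs
 * padding syntax is assumed from the Tensor module documentation):
 *   Eigen::Tensor<float, 2> input(2, 3);
 *   input.setConstant(1.0f);
 *   Eigen::array<std::pair<int, int>, 2> paddings;
 *   paddings[0] = std::make_pair(1, 2);   // 1 row before, 2 rows after
 *   paddings[1] = std::make_pair(0, 1);   // 1 column after
 *   Eigen::Tensor<float, 2> padded = input.pad(paddings, -1.0f);  // fill with -1 rather than 0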
* */ namespace internal { @@ -63,11 +63,13 @@ class TensorPaddingOp : public TensorBase::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims) - : m_xpr(expr), m_padding_dims(padding_dims) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} EIGEN_DEVICE_FUNC const PaddingDimensions& padding() const { return m_padding_dims; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } EIGEN_DEVICE_FUNC const typename internal::remove_all::type& @@ -76,6 +78,7 @@ class TensorPaddingOp : public TensorBase, Device }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) { // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector @@ -151,27 +154,27 @@ struct TensorEvaluator, Device for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return internal::scalar_cast_op()(0); + return m_paddingValue; } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { - return internal::scalar_cast_op()(0); + return m_paddingValue; } inputIndex += (index - m_padding[0].first); } else { for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return internal::scalar_cast_op()(0); + return m_paddingValue; } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i+1]; } if (index < m_padding[NumDims-1].first || index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { - return internal::scalar_cast_op()(0); + return m_paddingValue; } inputIndex += (index - m_padding[NumDims-1].first); } @@ -194,14 +197,14 @@ struct TensorEvaluator, Device { const Index idx = coords[0]; if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { - return internal::scalar_cast_op()(0); + return m_paddingValue; } inputIndex = idx - m_padding[0].first; } for (int i = 1; i < NumDims; ++i) { const Index idx = coords[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return internal::scalar_cast_op()(0); + return m_paddingValue; } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; } @@ -209,14 +212,14 @@ struct TensorEvaluator, Device { const Index idx = coords[NumDims-1]; if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { - return internal::scalar_cast_op()(0); + return m_paddingValue; } inputIndex = idx - m_padding[NumDims-1].first; } for (int i = NumDims - 2; i >= 0; --i) { const Index idx = coords[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return internal::scalar_cast_op()(0); + return m_paddingValue; } inputIndex += (idx - 
m_padding[i].first) * m_inputStrides[i]; } @@ -245,11 +248,11 @@ struct TensorEvaluator, Device if (last < lastPaddedLeft) { // all the coefficients are in the padding zone. - return internal::pset1(internal::scalar_cast_op()(0)); + return internal::pset1(m_paddingValue); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficients are in the padding zone. - return internal::pset1(internal::scalar_cast_op()(0)); + return internal::pset1(m_paddingValue); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficients are between the 2 padding zones. @@ -271,11 +274,11 @@ struct TensorEvaluator, Device if (last < lastPaddedLeft) { // all the coefficients are in the padding zone. - return internal::pset1(internal::scalar_cast_op()(0)); + return internal::pset1(m_paddingValue); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficients are in the padding zone. - return internal::pset1(internal::scalar_cast_op()(0)); + return internal::pset1(m_paddingValue); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficients are between the 2 padding zones. @@ -304,11 +307,11 @@ struct TensorEvaluator, Device if (last < lastPaddedLeft) { // all the coefficients are in the padding zone. - return internal::pset1(internal::scalar_cast_op()(0)); + return internal::pset1(m_paddingValue); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficients are in the padding zone. - return internal::pset1(internal::scalar_cast_op()(0)); + return internal::pset1(m_paddingValue); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficients are between the 2 padding zones. @@ -330,11 +333,11 @@ struct TensorEvaluator, Device if (last < lastPaddedLeft) { // all the coefficients are in the padding zone. - return internal::pset1(internal::scalar_cast_op()(0)); + return internal::pset1(m_paddingValue); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficients are in the padding zone. - return internal::pset1(internal::scalar_cast_op()(0)); + return internal::pset1(m_paddingValue); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficients are between the 2 padding zones. @@ -361,6 +364,8 @@ struct TensorEvaluator, Device array m_inputStrides; TensorEvaluator m_impl; PaddingDimensions m_padding; + + Scalar m_paddingValue; }; -- cgit v1.2.3 From 0bb5de05a131e955190e9a408bdc0e00b1929745 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 7 Mar 2016 15:35:09 -0800 Subject: Finishing touches on igamma/igammac for GPU.  Tests now pass. --- Eigen/src/Core/SpecialFunctions.h | 22 ++++++++++++---------- unsupported/test/cxx11_tensor_cuda.cu | 12 ++++-------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 4a61325d4..567c02c61 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -520,14 +520,16 @@ struct igammac_impl { Copyright 1985, 1987, 1992 by Stephen L.
Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ + using std::log; const Scalar zero = 0; const Scalar one = 1; const Scalar two = 2; const Scalar machep = igamma_helper::machep(); - const Scalar maxlog = ::log(NumTraits::highest()); + const Scalar maxlog = log(NumTraits::highest()); const Scalar big = igamma_helper::big(); const Scalar biginv = 1 / big; const Scalar nan = NumTraits::quiet_NaN(); + const Scalar inf = NumTraits::infinity(); Scalar ans, ax, c, yc, r, t, y, z; Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; @@ -541,11 +543,9 @@ struct igammac_impl { return (one - igamma_impl::run(a, x)); } - using std::isinf; - if ((isinf)(x)) return zero; + if (x == inf) return zero; // std::isinf crashes on CUDA /* Compute x**a * exp(-x) / gamma(a) */ - using std::log; ax = a * log(x) - x - lgamma_impl::run(a); if (ax < -maxlog) { // underflow return zero; @@ -564,7 +564,7 @@ struct igammac_impl { ans = pkm1 / qkm1; using std::abs; - do { + while (true) { c += one; y += one; z += two; @@ -588,7 +588,8 @@ struct igammac_impl { qkm2 *= biginv; qkm1 *= biginv; } - } while (t > machep); + if (t <= machep) break; + } return (ans * ax); } @@ -683,10 +684,11 @@ struct igamma_impl { * k=0 | (a+k+1) * */ + using std::log; const Scalar zero = 0; const Scalar one = 1; const Scalar machep = igamma_helper::machep(); - const Scalar maxlog = ::log(NumTraits::highest()); + const Scalar maxlog = log(NumTraits::highest()); const Scalar nan = NumTraits::quiet_NaN(); double ans, ax, c, r; @@ -702,7 +704,6 @@ struct igamma_impl { } /* Compute x**a * exp(-x) / gamma(a) */ - using std::log; ax = a * log(x) - x - lgamma_impl::run(a); if (ax < -maxlog) { // underflow @@ -716,11 +717,12 @@ struct igamma_impl { c = one; ans = one; - do { + while (true) { r += one; c *= x/r; ans += c; - } while (c/ans > machep); + if (c/ans <= machep) break; + } return (ans * ax / a); } diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 348271e4b..1964d9e07 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -632,7 +632,6 @@ void test_cuda_igamma() Tensor a(6, 6); Tensor x(6, 6); Tensor out(6, 6); - Tensor expected_out(6, 6); out.setZero(); Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; @@ -641,7 +640,7 @@ void test_cuda_igamma() for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { a(i, j) = a_s[i]; - x(i, j) = x_s[i]; + x(i, j) = x_s[j]; } } @@ -686,8 +685,7 @@ void test_cuda_igamma() for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { if ((std::isnan)(igamma_s[i][j])) { - printf("got: %g\n", out(i, j)); - //VERIFY((std::isnan)(out(i, j))); + VERIFY((std::isnan)(out(i, j))); } else { VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); } @@ -701,7 +699,6 @@ void test_cuda_igammac() Tensor a(6, 6); Tensor x(6, 6); Tensor out(6, 6); - Tensor expected_out(6, 6); out.setZero(); Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; @@ -710,7 +707,7 @@ void test_cuda_igammac() for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { a(i, j) = a_s[i]; - x(i, j) = x_s[i]; + x(i, j) = x_s[j]; } } @@ -754,8 +751,7 @@ void test_cuda_igammac() for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { if ((std::isnan)(igammac_s[i][j])) { - printf("got: %g\n", out(i, j)); - //VERIFY((std::isnan)(out(i, j))); + VERIFY((std::isnan)(out(i, j))); } else { VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); } -- cgit v1.2.3 From 
3b614a235823322fd2ad9e367e36384bd353f9f8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Mar 2016 17:53:28 -0800 Subject: Use NumTraits::highest() and NumTraits::lowest() instead of std::numeric_limits to make the tensor min and max functors more CUDA friendly. --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 528909688..b24f06df8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -149,11 +149,11 @@ template struct MaxReducer } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return -(std::numeric_limits::max)(); + return Eigen::NumTraits::lowest(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(-(std::numeric_limits::max)()); + return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; @@ -182,11 +182,11 @@ template struct MinReducer } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return (std::numeric_limits::max)(); + return Eigen::NumTraits::highest(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1((std::numeric_limits::max)()); + return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; @@ -722,6 +722,7 @@ template <> class NormalRandomGenerator > { template class NormalRandomGenerator { public: + static const bool PacketAccess = false; NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {} private: -- cgit v1.2.3 From e09eb835dbf15b7bd0de9dc8786080a2eb377fdb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Mar 2016 12:07:33 -0800 Subject: Decoupled the packet type definition from the definition of the tensor ops. All the vectorization is now defined in the tensor evaluators. This will make it possible to reliably support devices with different packet types in the same compilation unit.
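(A minimal sketch of the pattern, under simplified stand-in names; the real trait used throughout the diff below is PacketType, keyed on both the coefficient type and the device, so each evaluator picks the packet type its device actually supports:)

    // Illustration only: a (scalar, device)-keyed packet-type trait.
    struct HostDevice {};
    struct CudaDeviceTag {};
    struct Float4StandIn { float x, y, z, w; };  // stand-in for CUDA's float4

    template <typename Scalar, typename Device>
    struct PacketTypeSketch { typedef Scalar type; };  // default: scalar fallback

    template <>
    struct PacketTypeSketch<float, CudaDeviceTag> {
      typedef Float4StandIn type;  // 4-wide packet when evaluating on the GPU
    };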
--- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 3 --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 5 +---- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 5 +---- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 5 +---- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 6 +---- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 12 +++------- .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 3 +-- .../CXX11/src/Tensor/TensorContractionMapper.h | 4 ++-- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 3 +-- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 7 ++---- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 8 ++----- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 12 ++-------- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 8 ++----- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 26 ++++++++++------------ unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 13 ----------- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 -- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 8 ++----- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 5 +---- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 5 +---- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 6 ++--- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 5 +---- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 2 -- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 14 ++++-------- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 5 +---- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 5 +---- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 5 +---- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 7 ++---- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 7 ++---- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 7 ++---- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 8 +++---- .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 5 +---- 31 files changed, 55 insertions(+), 161 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 17e485f0a..759dede3f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -69,14 +69,11 @@ class Tensor : public TensorBase::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef Scalar_ Scalar; - typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; - typedef typename Base::PacketReturnType PacketReturnType; enum { IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign), - PacketAccess = (internal::packet_traits::size > 1), Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 10fac0cc5..199d2ce41 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -25,7 +25,6 @@ template struct traits > { typedef typename LhsXprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; typedef typename traits::StorageKind StorageKind; typedef typename promote_index_type::Index, typename traits::Index>::type Index; @@ -62,10 +61,8 @@ class TensorAssignOp : public TensorBase { public: typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename LhsXprType::CoeffReturnType CoeffReturnType; - typedef typename LhsXprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -110,7 +107,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index efca7cd79..b6e6db12a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -25,7 +25,6 @@ struct traits > : public traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -70,10 +69,8 @@ class TensorBroadcastingOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -144,7 +141,7 @@ struct TensorEvaluator, Device> } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index a209e885b..ba8111316 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -26,7 +26,6 @@ struct traits > : public traits { typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -80,10 +79,8 @@ class TensorChippingOp : public TensorBase > { public: 
typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -184,7 +181,7 @@ struct TensorEvaluator, Device> } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index f57d2bb7d..122306e5c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -26,7 +26,6 @@ struct traits > // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename promote_storage_type::ret Scalar; - typedef typename packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -60,14 +59,11 @@ class TensorConcatenationOp : public TensorBase::Scalar Scalar; - typedef typename internal::traits::Packet Packet; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::nested::type Nested; typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::promote_storage_type::ret PacketReturnType; typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) @@ -120,7 +116,7 @@ struct TensorEvaluator Dimensions; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1adb68894..75bd23412 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -27,7 +27,6 @@ struct traits > // Type promotion to handle the case where the types of the lhs and the rhs are different. 
typedef typename internal::promote_storage_type::ret Scalar; - typedef typename internal::packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -76,11 +75,8 @@ class TensorContractionOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::promote_storage_type::ret PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -118,10 +114,9 @@ struct TensorContractionEvaluatorBase typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Packet Packet; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; enum { IsAligned = true, @@ -434,7 +429,7 @@ struct TensorContractionEvaluatorBase template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_result + index); + return internal::ploadt(m_result + index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } @@ -478,10 +473,9 @@ struct TensorEvaluator XprType; typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Packet Packet; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; enum { Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index f5b539c7e..a4a06ab5f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1213,10 +1213,9 @@ struct TensorEvaluator XprType; typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Packet Packet; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; enum { Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 392aa6d37..63f40b2b6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -230,8 +230,8 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper::type Packet; - typedef typename packet_traits::half HalfPacket; + typedef typename Tensor::PacketReturnType Packet; + typedef typename unpacket_traits::half HalfPacket; template EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 51a3b9490..41bb704d5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -65,10 +65,9 @@ struct TensorEvaluator XprType; 
typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Packet Packet; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; enum { Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 4e87813a9..f2dee3ee8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -25,7 +25,6 @@ struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef TargetType Scalar; - typedef typename packet_traits::type Packet; typedef typename traits::StorageKind StorageKind; typedef typename traits::Index Index; typedef typename XprType::Nested Nested; @@ -146,12 +145,10 @@ class TensorConversionOp : public TensorBase::Scalar Scalar; - typedef typename internal::traits::Packet Packet; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::nested::type Nested; typedef Scalar CoeffReturnType; - typedef Packet PacketReturnType; typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) @@ -190,8 +187,8 @@ struct TensorEvaluator, Device> typedef TargetType Scalar; typedef TargetType CoeffReturnType; typedef typename internal::remove_all::Scalar>::type SrcType; - typedef typename internal::traits::Packet PacketReturnType; - typedef typename internal::packet_traits::type PacketSourceType; + typedef typename PacketType::type PacketReturnType; + typedef typename PacketType::type PacketSourceType; enum { IsAligned = false, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 67c797802..4fe1fb943 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -221,7 +221,6 @@ struct traits > // Type promotion to handle the case where the types of the lhs and the rhs are different. 
typedef typename promote_storage_type::ret Scalar; - typedef typename packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -259,12 +258,9 @@ class TensorConvolutionOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::promote_storage_type::ret PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -373,7 +369,7 @@ struct TensorEvaluator::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -775,7 +771,7 @@ struct TensorEvaluator::type PacketReturnType; typedef typename InputArgType::Scalar Scalar; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 0f8a98caf..b58e513b4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -24,7 +24,6 @@ template struct traits > { typedef typename XprType::Scalar Scalar; - typedef typename packet_traits::type Packet; typedef typename XprType::StorageKind StorageKind; typedef typename XprType::Index Index; typedef typename XprType::Nested Nested; @@ -54,10 +53,8 @@ class TensorCustomUnaryOp : public TensorBase::Scalar Scalar; - typedef typename internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename internal::nested::type Nested; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; @@ -105,7 +102,7 @@ struct TensorEvaluator, Devi } typedef typename internal::remove_const::type CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -167,11 +164,8 @@ struct traits > { typedef typename internal::promote_storage_type::ret Scalar; - typedef typename packet_traits::type Packet; typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::promote_storage_type::ret PacketReturnType; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -205,10 +199,8 @@ class TensorCustomBinaryOp : public TensorBase::Scalar Scalar; - typedef typename internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::traits::CoeffReturnType CoeffReturnType; - typedef typename internal::traits::PacketReturnType PacketReturnType; typedef typename internal::nested::type Nested; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; @@ -261,7 +253,7 @@ struct TensorEvaluator::type CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index bd83d5de8..5d73d62d2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -26,7 +26,6 @@ struct traits > // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -61,10 +60,8 @@ class TensorEvalToOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -90,7 +87,6 @@ struct TensorEvaluator, Device> { typedef TensorEvalToOp XprType; typedef typename ArgType::Scalar Scalar; - typedef typename ArgType::Packet Packet; typedef typename TensorEvaluator::Dimensions Dimensions; enum { @@ -110,7 +106,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::remove_const::type PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } @@ -138,7 +134,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_buffer + index); + return internal::ploadt(m_buffer + index); } EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f726585b1..d8afdcd1b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -29,9 +29,8 @@ struct TensorEvaluator { typedef typename Derived::Index Index; typedef typename Derived::Scalar Scalar; - typedef typename Derived::Packet Packet; typedef typename Derived::Scalar CoeffReturnType; - typedef typename Derived::Packet PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; // NumDimensions is -1 for variable dim tensors @@ -40,7 +39,7 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, - PacketAccess = Derived::PacketAccess, + PacketAccess = (internal::unpacket_traits::size > 1), Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -75,13 +74,13 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_data + index); + return internal::ploadt(m_data + index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const Packet& x) + void writePacket(Index index, const PacketReturnType& x) { - return internal::pstoret(m_data + index, x); + 
return internal::pstoret(m_data + index, x); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { @@ -135,9 +134,8 @@ struct TensorEvaluator { typedef typename Derived::Index Index; typedef typename Derived::Scalar Scalar; - typedef typename Derived::Packet Packet; typedef typename Derived::Scalar CoeffReturnType; - typedef typename Derived::Packet PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; // NumDimensions is -1 for variable dim tensors @@ -146,7 +144,7 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, - PacketAccess = Derived::PacketAccess, + PacketAccess = (internal::unpacket_traits::size > 1), Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -176,7 +174,7 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt_ro(m_data + index); + return internal::ploadt_ro(m_data + index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { @@ -220,7 +218,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename internal::traits::Packet PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -271,7 +269,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename internal::traits::Packet PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -331,7 +329,7 @@ struct TensorEvaluator::Scalar CoeffReturnType; - typedef typename internal::traits::Packet PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -399,7 +397,7 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename internal::traits::Packet PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 194c68929..49d849e23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -32,7 +32,6 @@ template struct traits > : traits { - typedef typename XprType::Packet Packet; typedef traits XprTraits; typedef typename XprType::Scalar Scalar; typedef typename XprType::Nested XprTypeNested; @@ -54,10 +53,8 @@ class TensorCwiseNullaryOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef TensorCwiseNullaryOp Nested; typedef typename 
Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -88,7 +85,6 @@ struct traits > // current Scalar/Packet to see if the intent is Input or Output. typedef typename result_of::type Scalar; typedef traits XprTraits; - typedef typename internal::packet_traits::type Packet; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; @@ -118,10 +114,8 @@ class TensorCwiseUnaryOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef Scalar CoeffReturnType; - typedef typename internal::packet_traits::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -155,7 +149,6 @@ struct traits > BinaryOp(typename LhsXprType::Scalar, typename RhsXprType::Scalar)>::type Scalar; typedef traits XprTraits; - typedef typename internal::packet_traits::type Packet; typedef typename promote_storage_type< typename traits::StorageKind, typename traits::StorageKind>::ret StorageKind; @@ -197,10 +190,8 @@ class TensorCwiseBinaryOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef Scalar CoeffReturnType; - typedef typename internal::packet_traits::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -234,7 +225,6 @@ struct traits > { typedef typename traits::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -266,12 +256,9 @@ class TensorSelectOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::promote_storage_type::ret PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 70282dd83..9c0ed43b7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -33,7 +33,6 @@ class TensorFixedSize : public TensorBase::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef Scalar_ Scalar; - typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; @@ -41,7 +40,6 @@ class TensorFixedSize : public TensorBase0), - PacketAccess = (internal::packet_traits::size > 1), Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 58b864787..14f480901 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -26,7 +26,6 @@ struct traits > // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename traits::StorageKind StorageKind; typedef typename traits::Index Index; typedef typename XprType::Nested Nested; @@ -60,10 +59,8 @@ class TensorForcedEvalOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -85,7 +82,6 @@ struct TensorEvaluator, Device> { typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; - typedef typename ArgType::Packet Packet; typedef typename TensorEvaluator::Dimensions Dimensions; enum { @@ -101,7 +97,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } @@ -133,7 +129,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_buffer + index); + return internal::ploadt(m_buffer + index); } EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 96f74b992..4c11bca07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -25,7 +25,6 @@ struct traits > : public traits { typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -55,10 +54,8 @@ class TensorGeneratorOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -118,7 +115,7 @@ struct TensorEvaluator, Device> } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index bc6021c9e..0008f9890 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -32,7 +32,6 @@ struct traits > : public traits { typedef typename internal::remove_const::type Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -60,10 +59,8 @@ class TensorImagePatchOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -311,7 +308,7 @@ struct TensorEvaluator, Device> } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 2798956ae..368e6f685 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -25,7 +25,6 @@ struct traits > : public traits { typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -53,10 +52,8 @@ class TensorInflationOp : public TensorBase, { public: typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -128,7 +125,8 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index a37516974..c5e29fe74 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -40,7 +40,6 @@ struct traits > : public traits { typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -70,10 +69,8 @@ class TensorLayoutSwapOp : public TensorBase, WriteA { public: typedef 
typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -136,7 +133,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 4a199cdd8..9ebd9172b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -28,7 +28,6 @@ template class TensorMap : public Tensor typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::traits::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; @@ -47,7 +46,6 @@ template class TensorMap : public Tensor enum { IsAligned = ((int(Options_)&Aligned)==Aligned), - PacketAccess = (internal::packet_traits::size > 1), Layout = PlainObjectType::Layout, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index e867e450e..afde7b3d2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -25,7 +25,6 @@ struct traits > : public traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -55,10 +54,8 @@ class TensorReshapingOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -124,7 +121,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -181,7 +178,7 @@ template typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { @@ -208,7 +205,6 @@ struct traits > : public traits XprTraits; - typedef 
typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -238,10 +234,8 @@ class TensorSlicingOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -361,7 +355,7 @@ struct TensorEvaluator, Devi typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef Sizes Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -549,7 +543,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef Sizes Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index eaaf4dc86..a595a0175 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -25,7 +25,6 @@ struct traits > : public traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -55,10 +54,8 @@ class TensorPaddingOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -134,7 +131,7 @@ struct TensorEvaluator, Device typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 57b716fd6..0bf460f4e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -25,7 +25,6 @@ struct traits > : public traits { typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -55,10 +54,8 @@ class TensorPatchOp : public TensorBase, ReadOn { public: typedef typename Eigen::internal::traits::Scalar 
Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -141,7 +138,7 @@ struct TensorEvaluator, Device> } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index d01a63ccb..4f2801e53 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -26,7 +26,6 @@ struct traits > { typedef traits XprTraits; typedef typename XprTraits::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -381,10 +380,8 @@ template class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -509,7 +506,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::remove_const::type PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 57197d060..bc92d9e6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -125,7 +125,6 @@ template class TensorRef : public TensorBase::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::traits::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; typedef Scalar* PointerType; @@ -358,9 +357,8 @@ struct TensorEvaluator, Device> { typedef typename Derived::Index Index; typedef typename Derived::Scalar Scalar; - typedef typename Derived::Packet Packet; typedef typename Derived::Scalar CoeffReturnType; - typedef typename Derived::Packet PacketReturnType; + typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; enum { @@ -404,9 +402,8 @@ struct TensorEvaluator, Device> : public TensorEvaluator::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; typedef TensorEvaluator, Device> 
Base; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 846f81e0f..96d92038c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -25,7 +25,6 @@ struct traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -55,10 +54,8 @@ class TensorReverseOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; @@ -140,7 +137,7 @@ struct TensorEvaluator, Device typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -248,7 +245,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index c4adb7d4c..c19833ea5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -25,7 +25,6 @@ struct traits > : public traits { typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -55,10 +54,8 @@ class TensorShufflingOp : public TensorBase { public: typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -149,7 +146,7 @@ struct TensorEvaluator, Device> } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -234,7 +231,7 @@ struct TensorEvaluator, Device> { } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 
2c2eb6515..085f8fd3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -25,7 +25,6 @@ struct traits > : public traits { typedef typename XprType::Scalar Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -55,10 +54,8 @@ class TensorStridingOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -147,7 +144,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -267,7 +264,8 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 04f4f8ffc..5bdfbad46 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -27,7 +27,6 @@ struct traits > : public traits { typedef typename internal::remove_const::type Scalar; typedef traits XprTraits; - typedef typename packet_traits::type Packet; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; @@ -55,10 +54,8 @@ class TensorVolumePatchOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -340,7 +337,7 @@ struct TensorEvaluator, D } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } -- cgit v1.2.3 From 8768c063f5607f27b899102abf472815981cf788 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Mar 2016 12:26:49 -0800 Subject: Fixed the tensor chipping code. 
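The commits above all make the same substitution: the per-expression Packet and PacketReturnType typedefs are removed, and each evaluator derives its packet type from the (scalar, device) pair via PacketType. A minimal host-only sketch of what such a trait looks like; the real PacketType lives in the unsupported Tensor module and is additionally specialized per device (e.g. for GpuDevice), so the body below is an illustrative assumption, not the shipped definition:

#include <Eigen/Core>

namespace sketch {
// Device-aware packet selection: defer to packet_traits for the scalar,
// then recover the lane count from the concrete packet type. This is also
// why later hunks test unpacket_traits<PacketReturnType>::size > 1 rather
// than packet_traits<Scalar>::size > 1: the former stays correct even when
// a device maps the scalar to a non-default packet type.
template <typename Scalar, typename Device>
struct PacketType {
  typedef typename Eigen::internal::packet_traits<Scalar>::type type;
  enum { size = Eigen::internal::unpacket_traits<type>::size };
};
}  // namespace sketch

An evaluator then declares typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;, which is the form the TensorEvaluator hunks in these commits introduce.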
--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 2 -- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 8 ++++---- .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 2 +- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 66772a3ad..c854afd2f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -31,7 +31,6 @@ class TensorBase typedef typename DerivedTraits::Scalar Scalar; typedef typename DerivedTraits::Index Index; typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::packet_traits::type PacketReturnType; static const int NumDimensions = DerivedTraits::NumDimensions; // Generic nullary operation support. @@ -706,7 +705,6 @@ class TensorBase : public TensorBase::type PacketReturnType; static const int NumDimensions = DerivedTraits::NumDimensions; template friend class Tensor; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index ba8111316..5023371ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -310,7 +310,7 @@ struct TensorEvaluator, Device> { } typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 75bd23412..18b20b2dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -120,7 +120,7 @@ struct TensorContractionEvaluatorBase enum { IsAligned = true, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (internal::unpacket_traits::size > 1), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true @@ -381,8 +381,8 @@ struct TensorContractionEvaluatorBase typedef typename internal::remove_const::type RhsScalar; typedef TensorEvaluator LeftEvaluator; typedef TensorEvaluator RightEvaluator; - const Index lhs_packet_size = internal::packet_traits::size; - const Index rhs_packet_size = internal::packet_traits::size; + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; const int rhs_alignment = RightEvaluator::IsAligned ? 
Aligned : Unaligned; typedef internal::TensorContractionInputMapper LeftEvaluator; typedef TensorEvaluator RightEvaluator; - const Index lhs_packet_size = internal::packet_traits::size; - const Index rhs_packet_size = internal::packet_traits::size; + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; typedef internal::TensorContractionInputMapper::type Packet; + typedef typename Tensor::PacketReturnType Packet; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload::type>(data); + return pload(data); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { @@ -334,8 +334,8 @@ template class TensorContractionSubMapper { public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; + typedef typename Tensor::PacketReturnType Packet; + typedef typename unpacket_traits::half HalfPacket; typedef BaseTensorContractionMapper ParentMapper; typedef TensorContractionSubMapper Self; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 41bb704d5..02b3c6dea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -135,8 +135,8 @@ struct TensorEvaluatorm_device.memset(buffer, 0, m * n * sizeof(Scalar)); - const int lhs_packet_size = internal::packet_traits::size; - const int rhs_packet_size = internal::packet_traits::size; + const int lhs_packet_size = internal::unpacket_traits::size; + const int rhs_packet_size = internal::unpacket_traits::size; typedef internal::TensorContractionInputMapper, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_functor.template packetOp(index); + return m_functor.template packetOp(index); } EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } -- cgit v1.2.3 From 551ff11d0d1ad8025de77166ea2ec86874cb717d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Mar 2016 12:28:10 -0800 Subject: Fixed the tensor layout swapping code --- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index c5e29fe74..9b85914ff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -187,7 +187,7 @@ template typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename PacketType::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { -- cgit v1.2.3 From a81b88bef7d539e4050358d4c0e17c61c6ed3141 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Mar 2016 12:30:19 -0800 Subject: Fixed the tensor concatenation code --- unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 
122306e5c..7738f18fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -296,7 +296,7 @@ template::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { -- cgit v1.2.3 From 5a427a94a9c04f5cc32c185c9eebe10e40956d5e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Mar 2016 13:28:06 -0800 Subject: Fixed the tensor generator code --- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 4c11bca07..e4154bd0b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -85,10 +85,11 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions Dimensions; static const int NumDims = internal::array_size::value; typedef typename XprType::Scalar Scalar; - + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (internal::unpacket_traits::size > 1), BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -114,9 +115,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { -- cgit v1.2.3 From 6d6413f76832a094d0835770af2adfaabba24738 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Mar 2016 16:02:00 -0800 Subject: Simplified the full reduction code --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 143 ++++++++++----------- 1 file changed, 71 insertions(+), 72 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 4f2801e53..875155243 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -221,121 +221,120 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers -template +template struct FullReducerShard { - static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { - - shard->saccum = reducer.initialize(); - for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) { - reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); - } - } - - typename Eval::CoeffReturnType saccum; -}; - -template -struct FullReducerShard { - static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { - - const int packetSize = internal::unpacket_traits::size; - const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - - shard->paccum = reducer.template initializePacket(); - for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket(eval.m_impl.template packet(firstIndex + j), &shard->paccum); - } - shard->saccum = reducer.initialize(); - for (typename Eval::Index j = VectorizedSize; j < 
numValuesToReduce; ++j) { - reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); - } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer, + typename Self::CoeffReturnType* output) { + *output = InnerMostDimReducer::reduce( + self, firstIndex, numValuesToReduce, reducer); } - - typename Eval::PacketReturnType paccum; - typename Eval::CoeffReturnType saccum; }; - template struct FullReducer { static const bool HasOptimizedImplementation = !Op::IsStateful; + static const int PacketSize = + unpacket_traits::size; // launch one reducer per thread and accumulate the result. - static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); - const Index blocksize = std::floor(static_cast(num_coeffs)/device.numThreads()); - const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; - eigen_assert(num_coeffs >= numblocks * blocksize); - - std::vector results; - results.reserve(numblocks); - std::vector > shards; - shards.resize(numblocks); - for (Index i = 0; i < numblocks; ++i) { - results.push_back(device.enqueue(&FullReducerShard::run, self, i*blocksize, blocksize, reducer, &shards[i])); - } - - FullReducerShard finalShard; - if (numblocks * blocksize < num_coeffs) { - FullReducerShard::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const int num_threads = device.numThreads(); + if (num_threads == 1) { + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + return; } else { - finalShard.saccum = reducer.initialize(); - } - - for (Index i = 0; i < numblocks; ++i) { - wait_until_ready(results[i]); - delete results[i]; - } + const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + std::vector results; + results.reserve(numblocks); + std::vector shards(numblocks, reducer.initialize()); + for (Index i = 0; i < numblocks; ++i) { + results.push_back( + device.enqueue(&FullReducerShard::run, self, + i * blocksize, blocksize, reducer, &shards[i])); + } - for (Index i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i].saccum, &finalShard.saccum); + typename Self::CoeffReturnType finalShard; + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); + } else { + finalShard = reducer.initialize(); + } + for (Index i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i], &finalShard); + } + *output = reducer.finalize(finalShard); } - *output = reducer.finalize(finalShard.saccum); } }; template struct FullReducer { static const bool HasOptimizedImplementation = !Op::IsStateful; + static const int PacketSize = + unpacket_traits::size; // launch one reducer per thread and accumulate the result. 
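The rewritten FullReducer reduces each block to a plain CoeffReturnType shard and folds the tail and the shards serially at the end. The same blocking scheme, sketched with std::thread and std::accumulate standing in for ThreadPoolDevice::enqueue, Notification, and the reducer (all assumptions here, since only the tensor-side code is shown in this patch):

#include <numeric>
#include <thread>
#include <vector>

// Sketch: block a length-n sum across workers the way the rewritten
// FullReducer does. Equal blocks per worker, one scalar shard each, then a
// serial pass that folds the tail [numblocks * blocksize, n) and the shards.
double full_sum(const double* data, long n, int num_threads) {
  const long blocksize = n / num_threads;  // floor, as in the patch
  const long numblocks = blocksize > 0 ? n / blocksize : 0;
  std::vector<double> shards(numblocks, 0.0);  // reducer.initialize() per shard
  std::vector<std::thread> workers;            // device.enqueue() stand-in
  for (long i = 0; i < numblocks; ++i) {
    workers.emplace_back([&shards, data, blocksize, i] {
      shards[i] = std::accumulate(data + i * blocksize,
                                  data + (i + 1) * blocksize, 0.0);
    });
  }
  // finalShard: the leftover tail is reduced on the calling thread.
  double final_shard = std::accumulate(data + numblocks * blocksize,
                                       data + n, 0.0);
  for (std::thread& w : workers) w.join();   // wait_until_ready() stand-in
  for (double s : shards) final_shard += s;  // reducer.reduce(shards[i], ...)
  return final_shard;                        // reducer.finalize(finalShard)
}

int main() {
  std::vector<double> v(1000, 1.0);
  return full_sum(v.data(), static_cast<long>(v.size()), 4) == 1000.0 ? 0 : 1;
}

The num_coeffs == 0 and single-thread early-outs in the patch bypass this whole path; the sketch keeps only the sharded case.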
- static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); - const Index blocksize = std::floor(static_cast(num_coeffs)/device.numThreads()); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const int num_threads = device.numThreads(); + if (num_threads == 1) { + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + return; + } + const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); std::vector results; results.reserve(numblocks); - std::vector > shards; - shards.resize(numblocks); + std::vector shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { - results.push_back(device.enqueue(&FullReducerShard::run, self, i*blocksize, blocksize, reducer, &shards[i])); + results.push_back(device.enqueue(&FullReducerShard::run, + self, i * blocksize, blocksize, reducer, + &shards[i])); } - - FullReducerShard finalShard; + typename Self::CoeffReturnType finalShard; if (numblocks * blocksize < num_coeffs) { - FullReducerShard::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + finalShard = InnerMostDimReducer::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); } else { - finalShard.paccum = reducer.template initializePacket(); - finalShard.saccum = reducer.initialize(); + finalShard = reducer.initialize(); } for (Index i = 0; i < numblocks; ++i) { wait_until_ready(results[i]); delete results[i]; } - for (Index i = 0; i < numblocks; ++i) { - reducer.reducePacket(shards[i].paccum, &finalShard.paccum); - reducer.reduce(shards[i].saccum, &finalShard.saccum); + reducer.reduce(shards[i], &finalShard); } - - *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum); + *output = reducer.finalize(finalShard); } }; + #endif -- cgit v1.2.3 From 46177c8d648a27d82d34cebed7e2b5bc59d441fc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Mar 2016 16:37:27 -0800 Subject: Replace std::vector with our own implementation, as using the stl when compiling with nvcc and avx enabled leads to many issues. --- unsupported/Eigen/CXX11/Core | 1 + .../Eigen/CXX11/src/Core/util/MaxSizeVector.h | 130 +++++++++++++++++++++ .../CXX11/src/Tensor/TensorContractionThreadPool.h | 16 ++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 10 +- 5 files changed, 143 insertions(+), 17 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core index e3e2cb60c..946145f5a 100644 --- a/unsupported/Eigen/CXX11/Core +++ b/unsupported/Eigen/CXX11/Core @@ -33,6 +33,7 @@ #include #include "src/Core/util/EmulateArray.h" +#include "src/Core/util/MaxSizeVector.h" // Emulate the cxx11 functionality that we need if the compiler doesn't support it. 
// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it diff --git a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h new file mode 100644 index 000000000..551124bae --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h @@ -0,0 +1,130 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_FIXEDSIZEVECTOR_H +#define EIGEN_FIXEDSIZEVECTOR_H + +namespace Eigen { + +/** \class MaxSizeVector + * \ingroup Core + * + * \brief The MaxSizeVector class. + * + * The %MaxSizeVector provides a subset of std::vector functionality. + * + * The goal is to provide basic std::vector operations when using + * std::vector is not an option (e.g. on GPU or when compiling using + * FMA/AVX, as this can cause either compilation failures or illegal + * instruction failures). + * + * Beware: The constructors are not API compatible with those of + * std::vector. + */ +template +class MaxSizeVector { + public: + // Construct a new MaxSizeVector, reserve n elements; size() starts at 0. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit MaxSizeVector(size_t n) + : reserve_(n), size_(0), + data_(static_cast(internal::aligned_malloc(n * sizeof(T)))) { + for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; } + } + + // Construct a new MaxSizeVector, reserve and resize to n. + // Copy the init value to all elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit MaxSizeVector(size_t n, const T& init) + : reserve_(n), size_(n), + data_(static_cast(internal::aligned_malloc(n * sizeof(T)))) { + for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ~MaxSizeVector() { + for (size_t i = 0; i < size_; ++i) { + data_[i].~T(); + } + internal::aligned_free(data_); + } + + // Append new elements (up to reserved size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void push_back(const T& t) { + eigen_assert(size_ < reserve_); + data_[size_++] = t; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& operator[] (size_t i) const { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& operator[] (size_t i) { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& back() { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& back() const { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void pop_back() { + // NOTE: This does not destroy the value at the end the way + // std::vector's version of pop_back() does. That happens when + // the Vector is destroyed.
+ eigen_assert(size_ > 0); + size_--; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t size() const { return size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool empty() const { return size_ == 0; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* data() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* data() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* begin() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* end() { return data_ + size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* begin() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* end() const { return data_ + size_; } + + private: + size_t reserve_; + size_t size_; + T* data_; +}; + +} // namespace Eigen + +#endif // EIGEN_FIXEDSIZEVECTOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 02b3c6dea..9044454fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -28,7 +28,7 @@ struct packLhsArg { template struct packRhsAndKernelArg { - const std::vector* blockAs; + const MaxSizeVector* blockAs; RhsScalar* blockB; const RhsMapper& rhs; OutputMapper& output; @@ -46,8 +46,8 @@ struct packRhsAndKernelArg { const Index n_block_idx; const Index m_blocks; const Index n_blocks; - std::vector* kernel_notifications; - const std::vector* lhs_notifications; + MaxSizeVector* kernel_notifications; + const MaxSizeVector* lhs_notifications; const bool need_to_pack; }; @@ -202,8 +202,7 @@ struct TensorEvaluator blockAs; - blockAs.reserve(num_threads); + MaxSizeVector blockAs(num_threads); for (int i = 0; i < num_threads; i++) { blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); } @@ -212,18 +211,17 @@ struct TensorEvaluator blockBs; - blockBs.reserve(n_blocks); + MaxSizeVector blockBs(n_blocks); for (int i = 0; i < n_blocks; i++) { blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); } // lhs_notifications starts with all null Notifications - std::vector lhs_notifications(num_threads, nullptr); + MaxSizeVector lhs_notifications(num_threads, nullptr); // this should really be numBlockAs * n_blocks; const Index num_kernel_notifications = num_threads * n_blocks; - std::vector kernel_notifications(num_kernel_notifications, + MaxSizeVector kernel_notifications(num_kernel_notifications, nullptr); for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index fd9919829..54da77bcf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -127,8 +127,7 @@ class TensorExecutor const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; - std::vector results; - results.reserve(numblocks); + MaxSizeVector results(numblocks); for (int i = 0; i < numblocks; ++i) { results.push_back(device.enqueue(&EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize)); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 875155243..2d7fb80d4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ 
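MaxSizeVector's two deliberate departures from std::vector (capacity fixed at construction, pop_back leaving destruction to the destructor) are easiest to see in use. A short sketch, assuming the unsupported/Eigen/CXX11/Core umbrella header extended at the top of this commit is reachable on the include path:

#include <cassert>
#include <unsupported/Eigen/CXX11/Core>

int main() {
  Eigen::MaxSizeVector<int> v(3);  // capacity 3; size() starts at 0
  assert(v.empty());
  v.push_back(1);
  v.push_back(2);
  v.push_back(3);  // a fourth push_back would trip the eigen_assert
  assert(v.size() == 3 && v.back() == 3);
  v.pop_back();    // size() shrinks; the element is destroyed only in ~MaxSizeVector
  assert(v.size() == 2);
  return (v[0] + v[1] == 3) ? 0 : 1;
}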
-256,9 +256,8 @@ struct FullReducer { const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); - std::vector results; - results.reserve(numblocks); - std::vector shards(numblocks, reducer.initialize()); + MaxSizeVector results(numblocks); + MaxSizeVector shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { results.push_back( device.enqueue(&FullReducerShard::run, self, @@ -308,9 +307,8 @@ struct FullReducer { const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); - std::vector results; - results.reserve(numblocks); - std::vector shards(numblocks, reducer.initialize()); + MaxSizeVector results(numblocks); + MaxSizeVector shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { results.push_back(device.enqueue(&FullReducerShard::run, self, i * blocksize, blocksize, reducer, -- cgit v1.2.3 From 14f0fde51fb510cb5c16e7f7841fa0987054ac01 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Tue, 8 Mar 2016 17:17:44 -0800 Subject: Add certain functions to numext (log, exp, tan) because CUDA doesn't support std:: Use these in SpecialFunctions. --- Eigen/src/Core/MathFunctions.h | 62 +++++++ Eigen/src/Core/SpecialFunctions.h | 342 +++++++++++++++++++++++++++++++++++++- 2 files changed, 400 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 447f1b834..0be6de71f 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -946,6 +946,13 @@ T (floor)(const T& x) return floor(x); } +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float floor(const float &x) { return ::floorf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double floor(const double &x) { return ::floor(x); } + + template EIGEN_DEVICE_FUNC T (ceil)(const T& x) @@ -985,6 +992,61 @@ T sqrt(const T &x) return sqrt(x); } +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T log(const T &x) { + EIGEN_USING_STD_MATH(log); + return log(x); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float log(const float &x) { return ::logf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double log(const double &x) { return ::log(x); } + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T tan(const T &x) { + EIGEN_USING_STD_MATH(tan); + return tan(x); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float tan(const float &x) { return ::tanf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double tan(const double &x) { return ::tan(x); } + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T abs(const T &x) { + EIGEN_USING_STD_MATH(abs); + return abs(x); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float abs(const float &x) { return ::fabsf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double abs(const double &x) { return ::fabs(x); } + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T exp(const T &x) { + EIGEN_USING_STD_MATH(exp); + return exp(x); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float exp(const float &x) { return ::expf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double exp(const double &x) { return ::exp(x); } + } // end namespace numext namespace internal { diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index b02ad9a1f..c01a88d18 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -283,7 +283,7 @@ struct 
digamma_impl { Scalar p, q, nz, s, w, y; bool negative; - const Scalar maxnum = numext::numeric_limits::infinity(); + const Scalar maxnum = NumTraits::infinity(); const Scalar m_pi = 3.14159265358979323846; negative = 0; @@ -296,7 +296,7 @@ struct digamma_impl { if (x <= zero) { negative = one; q = x; - p = ::floor(q); + p = numext::floor(q); if (p == q) { return maxnum; } @@ -309,7 +309,7 @@ struct digamma_impl { p += one; nz = q - p; } - nz = m_pi / ::tan(m_pi * nz); + nz = m_pi / numext::tan(m_pi * nz); } else { nz = zero; @@ -327,7 +327,7 @@ struct digamma_impl { y = digamma_impl_maybe_poly::run(s); - y = ::log(s) - (half / s) - y - w; + y = numext::log(s) - (half / s) - y - w; return (negative) ? y - nz : y; } @@ -401,6 +401,327 @@ struct erfc_impl { }; #endif // EIGEN_HAS_C99_MATH +/**************************************************************************** + * Implementation of igammac (complemented incomplete gamma integral) * + ****************************************************************************/ + +template +struct igammac_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template struct igamma_impl; // predeclare igamma_impl + +template +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar big() { assert(false && "big not supported for this type"); return 0.0; } +}; + +template <> +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static float machep() { + return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static float big() { + // use epsneg (1.0 - epsneg == 1.0) + return 1.0 / (NumTraits::epsilon() / 2); + } +}; + +template <> +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static double machep() { + return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static double big() { + return 1.0 / NumTraits::epsilon(); + } +}; + +template +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igamc() + * + * Incomplete gamma integral (modified for Eigen) + * + * + * + * SYNOPSIS: + * + * double a, x, y, igamc(); + * + * y = igamc( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * + * igamc(a,x) = 1 - igam(a,x) + * + * inf. + * - + * 1 | | -t a-1 + * = ----- | e t dt. + * - | | + * | (a) - + * x + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 7.8e-6 5.9e-7 + * + * + * ACCURACY (double): + * + * Tested at random a, x. + * a x Relative error: + * arithmetic domain domain # trials peak rms + * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15 + * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. 
Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + const Scalar machep = igamma_helper::machep(); + const Scalar maxlog = numext::log(NumTraits::highest()); + const Scalar big = igamma_helper::big(); + const Scalar biginv = 1 / big; + const Scalar nan = NumTraits::quiet_NaN(); + const Scalar inf = NumTraits::infinity(); + + Scalar ans, ax, c, yc, r, t, y, z; + Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; + + if ((x < zero) || ( a <= zero)) { + // domain error + return nan; + } + + if ((x < one) || (x < a)) { + return (one - igamma_impl::run(a, x)); + } + + if (x == inf) return zero; // std::isinf crashes on CUDA + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl::run(a); + if (ax < -maxlog) { // underflow + return zero; + } + ax = numext::exp(ax); + + // continued fraction + y = one - a; + z = x + y + one; + c = zero; + pkm2 = one; + qkm2 = x; + pkm1 = x + one; + qkm1 = z * x; + ans = pkm1 / qkm1; + + while (true) { + c += one; + y += one; + z += two; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != zero) { + r = pk / qk; + t = numext::abs((ans - r) / r); + ans = r; + } else { + t = one; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (abs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if (t <= machep) break; + } + + return (ans * ax); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of igamma (incomplete gamma integral) * + ****************************************************************************/ + +template +struct igamma_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igam() + * Incomplete gamma integral + * + * + * + * SYNOPSIS: + * + * double a, x, y, igam(); + * + * y = igam( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * x + * - + * 1 | | -t a-1 + * igam(a,x) = ----- | e t dt. + * - | | + * | (a) - + * 0 + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (double): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 200000 3.6e-14 2.9e-15 + * IEEE 0,100 300000 9.9e-14 1.5e-14 + * + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 20000 7.8e-6 5.9e-7 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + + /* left tail of incomplete gamma function: + * + * inf. 
k + * a -x - x + * x e > ---------- + * - - + * k=0 | (a+k+1) + * + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = igamma_helper::machep(); + const Scalar maxlog = numext::log(NumTraits::highest()); + const Scalar nan = NumTraits::quiet_NaN(); + + double ans, ax, c, r; + + if (x == zero) return zero; + + if ((x < zero) || ( a <= zero)) { // domain error + return nan; + } + + if ((x > one) && (x > a)) { + return (one - igammac_impl::run(a, x)); + } + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl::run(a); + if (ax < -maxlog) { + // underflow + return zero; + } + ax = numext::exp(ax); + + /* power series */ + r = a; + c = one; + ans = one; + + while (true) { + r += one; + c *= x/r; + ans += c; + if (c/ans <= machep) break; + } + + return (ans * ax / a); + } +}; + +#endif // EIGEN_HAS_C99_MATH + } // end namespace internal namespace numext { @@ -429,8 +750,21 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); } +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) + igamma(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) + igammac(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x); +} + } // end namespace numext + } // end namespace Eigen #endif // EIGEN_SPECIAL_FUNCTIONS_H -- cgit v1.2.3 From 73220d2bb044e5f01b3eee4838c5b32ca679f027 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Tue, 8 Mar 2016 17:28:21 -0800 Subject: Resolve bad merge. --- Eigen/src/Core/GenericPacketMath.h | 10 ++ Eigen/src/Core/GlobalFunctions.h | 30 +++++ Eigen/src/Core/NumTraits.h | 5 + Eigen/src/Core/arch/CUDA/MathFunctions.h | 36 ++++++ Eigen/src/Core/functors/BinaryFunctors.h | 49 ++++++++ Eigen/src/Core/util/ForwardDeclarations.h | 2 + Eigen/src/Core/util/Meta.h | 5 + cmake/EigenTesting.cmake | 7 +- test/array.cpp | 51 +++++++- unsupported/test/cxx11_tensor_cuda.cu | 201 ++++++++++++++++++++++++++++++ 10 files changed, 393 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 02882bdea..802def51d 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -78,6 +78,8 @@ struct default_packet_traits HasDiGamma = 0, HasErf = 0, HasErfc = 0, + HasIGamma = 0, + HasIGammac = 0, HasRound = 0, HasFloor = 0, @@ -457,6 +459,14 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } +/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */ +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); } + +/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */ +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); } + /*************************************************************************** * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 
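Since igam and igamc are the regularized incomplete gamma functions, igamma(a, x) + igammac(a, x) = 1 for a > 0 and x >= 0, which gives a cheap sanity check of the two entry points. A host-side sketch using the array-level API added in the GlobalFunctions.h hunk that follows, assuming a toolchain where EIGEN_HAS_C99_MATH is defined (otherwise the non-C99 stubs static-assert):

#include <iostream>
#include <Eigen/Core>

int main() {
  Eigen::ArrayXd a(3), x(3);
  a << 0.5, 2.0, 10.0;  // a must be positive
  x << 0.5, 1.0, 12.0;  // x must be non-negative
  // Lower plus upper regularized incomplete gamma should reproduce 1.
  Eigen::ArrayXd sum = Eigen::igamma(a, x) + Eigen::igammac(a, x);
  std::cout << sum.transpose() << std::endl;  // prints 1 1 1 up to rounding
  return 0;
}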
396da8e71..7df0fdda9 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -129,6 +129,36 @@ namespace Eigen ); } + /** \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise incomplete gamma function. + * + */ + template + inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> + igamma(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) + { + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); + } + + /** \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise complementary incomplete gamma function. + * + */ + template + inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> + igammac(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) + { + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); + } + namespace internal { EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 6a596bb7d..7ddb4a867 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -95,6 +95,11 @@ template struct GenericNumTraits static inline T infinity() { return numext::numeric_limits::infinity(); } + + EIGEN_DEVICE_FUNC + static inline T quiet_NaN() { + return numext::numeric_limits::quiet_NaN(); + } }; template struct NumTraits : GenericNumTraits diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index a2c06a817..6822700f8 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -117,6 +117,42 @@ double2 perfc(const double2& a) } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigamma(const float4& a, const float4& x) +{ + using numext::igamma; + return make_float4( + igamma(a.x, x.x), + igamma(a.y, x.y), + igamma(a.z, x.z), + igamma(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigamma(const double2& a, const double2& x) +{ + using numext::igamma; + return make_double2(igamma(a.x, x.x), igamma(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigammac(const float4& a, const float4& x) +{ + using numext::igammac; + return make_float4( + igammac(a.x, x.x), + igammac(a.y, x.y), + igammac(a.z, x.z), + igammac(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac(const double2& a, const double2& x) +{ + using numext::igammac; + return make_double2(igammac(a.x, x.x), igammac(a.y, x.y)); +} + #endif } // end namespace internal diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 4962d625c..5cdfff845 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -337,6 +337,55 @@ template<> struct functor_traits { }; }; +/** \internal + * \brief Template functor to compute the incomplete gamma function igamma(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma + */ +template struct scalar_igamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igamma; return igamma(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + 
return internal::pigamma(a, x); + } +}; +template +struct functor_traits > { + enum { + // Guesstimate + Cost = 20 * NumTraits::MulCost + 10 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGamma + }; +}; + + +/** \internal + * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igammac + */ +template struct scalar_igammac_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igammac; return igammac(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const + { + return internal::pigammac(a, x); + } +}; +template +struct functor_traits > { + enum { + // Guesstimate + Cost = 20 * NumTraits::MulCost + 10 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGammac + }; +}; //---------- binary functors bound to a constant, thus appearing as a unary functor ---------- diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index f09632375..a102e5457 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -206,6 +206,8 @@ template struct scalar_add_op; template struct scalar_constant_op; template struct scalar_identity_op; template struct scalar_sign_op; +template struct scalar_igamma_op; +template struct scalar_igammac_op; template struct scalar_product_op; template struct scalar_multiple2_op; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 6b35179f2..24e8a6d8a 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -148,6 +148,7 @@ template struct numeric_limits static T (max)() { assert(false && "Highest not supported for this type"); } static T (min)() { assert(false && "Lowest not supported for this type"); } static T infinity() { assert(false && "Infinity not supported for this type"); } + static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); } }; template<> struct numeric_limits { @@ -159,6 +160,8 @@ template<> struct numeric_limits static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC static float infinity() { return CUDART_INF_F; } + EIGEN_DEVICE_FUNC + static float quiet_NaN() { return CUDART_NAN_F; } }; template<> struct numeric_limits { @@ -170,6 +173,8 @@ template<> struct numeric_limits static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC static double infinity() { return CUDART_INF; } + EIGEN_DEVICE_FUNC + static double quiet_NaN() { return CUDART_NAN; } }; template<> struct numeric_limits { diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 5022397a7..5ca800cfe 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -19,12 +19,15 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) - cuda_add_executable(${targetname} ${filename}) + if (${ARGC} GREATER 2) + cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) + else() + cuda_add_executable(${targetname} ${filename}) + endif() else() add_executable(${targetname} ${filename}) endif() - if (targetname MATCHES "^eigen2_") add_dependencies(eigen2_buildtests ${targetname}) else() diff --git a/test/array.cpp b/test/array.cpp index 96aef31c7..c61bfc8ed 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -295,7 +295,6 @@ template void array_real(const ArrayType& m)
VERIFY_IS_APPROX(Eigen::pow(m1,2*exponents), m1.square().square()); VERIFY_IS_APPROX(m1.pow(2*exponents), m1.square().square()); VERIFY_IS_APPROX(pow(m1(0,0), exponents), ArrayType::Constant(rows,cols,m1(0,0)*m1(0,0))); - VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt()); VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt()); @@ -305,6 +304,14 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(log10(m3), log(m3)/log(10)); + // Smoke test to check any compilation issues + ArrayType m1_abs_p1 = m1.abs() + 1; + ArrayType m2_abs_p1 = m2.abs() + 1; + VERIFY_IS_APPROX(Eigen::igamma(m1_abs_p1, m2_abs_p1), Eigen::igamma(m1_abs_p1, m2_abs_p1)); + VERIFY_IS_APPROX(Eigen::igammac(m1_abs_p1, m2_abs_p1), Eigen::igammac(m1_abs_p1, m2_abs_p1)); + VERIFY_IS_APPROX(Eigen::igamma(m2_abs_p1, m1_abs_p1), Eigen::igamma(m2_abs_p1, m1_abs_p1)); + VERIFY_IS_APPROX(Eigen::igammac(m2_abs_p1, m1_abs_p1), Eigen::igammac(m2_abs_p1, m1_abs_p1)); + // scalar by array division const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); @@ -323,6 +330,48 @@ template void array_real(const ArrayType& m) std::numeric_limits::infinity()); VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + // location i*6+j corresponds to a_s[i], x_s[j]. + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); + } + + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + } + } + } } #endif // EIGEN_HAS_C99_MATH diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 58da21d3b..1964d9e07 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -574,6 +574,191 @@ void test_cuda_lgamma(const Scalar stddev) cudaFree(d_out); } +template +void test_cuda_digamma() +{ + Tensor in(7); + Tensor out(7); + Tensor expected_out(7); + out.setZero(); + + in(0) = Scalar(1); + in(1) = Scalar(1.5); + in(2) = Scalar(4); + in(3) = Scalar(-10.5); + in(4) = Scalar(10000.5); + in(5) = Scalar(0); + 
in(6) = Scalar(-1); + + expected_out(0) = Scalar(-0.5772156649015329); + expected_out(1) = Scalar(0.03648997397857645); + expected_out(2) = Scalar(1.2561176684318); + expected_out(3) = Scalar(2.398239129535781); + expected_out(4) = Scalar(9.210340372392849); + expected_out(5) = std::numeric_limits::infinity(); + expected_out(6) = std::numeric_limits::infinity(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 7); + Eigen::TensorMap > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in.digamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 5; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + for (int i = 5; i < 7; ++i) { + VERIFY_IS_EQUAL(out(i), expected_out(i)); + } +} + +template +void test_cuda_igamma() +{ + Tensor a(6, 6); + Tensor x(6, 6); + Tensor out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + + + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 6, 6); + Eigen::TensorMap > gpu_x(d_x, 6, 6); + Eigen::TensorMap > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); + } + } + } +} + +template +void test_cuda_igammac() +{ + Tensor a(6, 6); + Tensor x(6, 6); + Tensor out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = 
std::numeric_limits::quiet_NaN(); + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 6, 6); + Eigen::TensorMap > gpu_x(d_x, 6, 6); + Eigen::TensorMap > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); + } + } + } +} + template void test_cuda_erf(const Scalar stddev) { @@ -667,30 +852,46 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_3(test_cuda_convolution_2d()); CALL_SUBTEST_3(test_cuda_convolution_3d()); CALL_SUBTEST_3(test_cuda_convolution_3d()); + CALL_SUBTEST_4(test_cuda_lgamma(1.0f)); CALL_SUBTEST_4(test_cuda_lgamma(100.0f)); CALL_SUBTEST_4(test_cuda_lgamma(0.01f)); CALL_SUBTEST_4(test_cuda_lgamma(0.001f)); + + CALL_SUBTEST_4(test_cuda_digamma()); + CALL_SUBTEST_4(test_cuda_erf(1.0f)); CALL_SUBTEST_4(test_cuda_erf(100.0f)); CALL_SUBTEST_4(test_cuda_erf(0.01f)); CALL_SUBTEST_4(test_cuda_erf(0.001f)); + CALL_SUBTEST_4(test_cuda_erfc(1.0f)); // CALL_SUBTEST(test_cuda_erfc(100.0f)); CALL_SUBTEST_4(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs CALL_SUBTEST_4(test_cuda_erfc(0.01f)); CALL_SUBTEST_4(test_cuda_erfc(0.001f)); + CALL_SUBTEST_4(test_cuda_lgamma(1.0)); CALL_SUBTEST_4(test_cuda_lgamma(100.0)); CALL_SUBTEST_4(test_cuda_lgamma(0.01)); CALL_SUBTEST_4(test_cuda_lgamma(0.001)); + + CALL_SUBTEST_4(test_cuda_digamma()); + CALL_SUBTEST_4(test_cuda_erf(1.0)); CALL_SUBTEST_4(test_cuda_erf(100.0)); CALL_SUBTEST_4(test_cuda_erf(0.01)); CALL_SUBTEST_4(test_cuda_erf(0.001)); + CALL_SUBTEST_4(test_cuda_erfc(1.0)); // CALL_SUBTEST(test_cuda_erfc(100.0)); CALL_SUBTEST_4(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs CALL_SUBTEST_4(test_cuda_erfc(0.01)); CALL_SUBTEST_4(test_cuda_erfc(0.001)); + + CALL_SUBTEST_5(test_cuda_igamma()); + CALL_SUBTEST_5(test_cuda_igammac()); + + CALL_SUBTEST_5(test_cuda_igamma()); + CALL_SUBTEST_5(test_cuda_igammac()); } -- cgit v1.2.3 From 6d306831133f3b40b1946beac98d6fe083295126 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Mar 2016 21:02:51 -0800 Subject: Fixed static assertion --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 52d5b7b1a..02d6646d8 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -36,8 +36,8 @@ struct TensorUInt128 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(const TensorUInt128& other) : high(other.high), low(other.low) { - EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), "high too wide"); - EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), "low too wide"); + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); } template -- cgit v1.2.3 From b084133dbfa089b8a8b6998df2c118ce3c20a7d5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Mar 2016 07:06:36 -0800 Subject: Fixed the integer division code on windows --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 75567f1ff..33c6c1b0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -36,7 +36,13 @@ namespace { #ifdef __CUDA_ARCH__ return (sizeof(T) == 8) ? __clzll(val) : __clz(val); #elif EIGEN_COMP_MSVC - return (sizeof(T) == 8) ? __lzcnt64(val) : __lzcnt(val); + unsigned long index; + if (sizeof(T) == 8) { + _BitScanReverse64(&index, val); + } else { + _BitScanReverse(&index, val); + } + return (sizeof(T) == 8) ? 63 - index : 31 - index; #else EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); return (sizeof(T) == 8) ? -- cgit v1.2.3 From 836e92a051f9f56801c52d603402aaa5e74a7a6f Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 9 Mar 2016 09:04:45 -0800 Subject: Update MathFunctions/SpecialFunctions with intelligent header guards. 
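The guard pattern this commit applies, reduced to a standalone sketch (the sketch namespace and the floor_ name are illustrative only, not Eigen's): a generic template stays available everywhere and defers to name lookup, while the specializations that call the C99 float variants are only compiled where the toolchain's math library provides ::floorf in the global namespace.

    #include <cmath>

    namespace sketch {
    // Generic fallback: defer to whatever floor() lookup finds.
    template <typename T>
    T floor_(const T& x) { using std::floor; return floor(x); }

    #ifdef EIGEN_HAS_C99_MATH  // same macro the patch below keys on
    // Fast paths that need the C99 math functions to exist.
    template <> float floor_(const float& x) { return ::floorf(x); }
    template <> double floor_(const double& x) { return ::floor(x); }
    #endif
    }  // namespace sketch
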
--- Eigen/src/Core/MathFunctions.h | 14 ++++++++++---- Eigen/src/Core/SpecialFunctions.h | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 0be6de71f..311808187 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -946,12 +946,13 @@ T (floor)(const T& x) return floor(x); } +#ifdef EIGEN_HAS_C99_MATH template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double floor(const double &x) { return ::floor(x); } - +#endif template EIGEN_DEVICE_FUNC @@ -999,12 +1000,13 @@ T log(const T &x) { return log(x); } +#ifdef EIGEN_HAS_C99_MATH template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double &x) { return ::log(x); } - +#endif template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE @@ -1013,12 +1015,13 @@ T tan(const T &x) { return tan(x); } +#ifdef EIGEN_HAS_C99_MATH template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tan(const double &x) { return ::tan(x); } - +#endif template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE @@ -1027,12 +1030,13 @@ T abs(const T &x) { return abs(x); } +#ifdef EIGEN_HAS_C99_MATH template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(const double &x) { return ::fabs(x); } - +#endif template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE @@ -1041,11 +1045,13 @@ T exp(const T &x) { return exp(x); } +#ifdef EIGEN_HAS_C99_MATH template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } +#endif } // end namespace numext diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index c01a88d18..c12e41a7b 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -284,7 +284,7 @@ struct digamma_impl { bool negative; const Scalar maxnum = NumTraits::infinity(); - const Scalar m_pi = 3.14159265358979323846; + const Scalar m_pi = EIGEN_PI; negative = 0; nz = 0.0; -- cgit v1.2.3 From 1d566417d29ff5baf8164826b389aa6dad71857a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Mar 2016 10:55:11 -0800 Subject: Enable the random number generators when compiling with visual studio --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index b24f06df8..8e45ae9e5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -359,7 +359,7 @@ template class UniformRandomGenerator { bool m_deterministic; }; -#if __cplusplus > 199711 +#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 template <> class UniformRandomGenerator { public: static const bool PacketAccess = true; @@ -565,7 +565,7 @@ struct functor_traits > { -#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711 +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900) // 
We're not compiling a cuda kernel template class NormalRandomGenerator { public: -- cgit v1.2.3 From f05fb449b845c5246da6817bb5300030b852318f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Mar 2016 15:27:45 -0800 Subject: Avoid unnecessary conversion from 32bit int to 64bit unsigned int --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 2d7fb80d4..3f1864fa7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -247,7 +247,7 @@ struct FullReducer { *output = reducer.finalize(reducer.initialize()); return; } - const int num_threads = device.numThreads(); + const std::size_t num_threads = device.numThreads(); if (num_threads == 1) { *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); return; -- cgit v1.2.3 From b2100b83ad3fab4a96cdcdf6cde58d809d105ac3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Mar 2016 16:03:16 -0800 Subject: Made sure to include the header file when compiling with visual studio --- unsupported/Eigen/CXX11/Tensor | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 3b5be4426..969f25481 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -38,7 +38,7 @@ typedef unsigned __int64 uint64_t; #include #endif -#if __cplusplus > 199711 +#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 #include #endif -- cgit v1.2.3 From 3149b5b1481a60161e5dc767183a621c70552a19 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Mar 2016 17:35:17 -0800 Subject: Avoid implicit cast --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 3f1864fa7..fe1dc22ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -298,7 +298,7 @@ struct FullReducer { *output = reducer.finalize(reducer.initialize()); return; } - const int num_threads = device.numThreads(); + const std::size_t num_threads = device.numThreads(); if (num_threads == 1) { *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); return; -- cgit v1.2.3 From a685a6beedc3fbd292ede7b7582545bce243d64e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Mar 2016 17:41:52 -0800 Subject: Made the list reductions less ambiguous. --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 4d99f786c..c582e21f5 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -259,22 +259,20 @@ template< template< typename Reducer, - typename A, - typename... Ts -> struct reduce + typename A +> struct reduce { - constexpr static inline A run(A a, Ts...) { return a; } + constexpr static inline A run(A a) { return a; } }; template< typename Reducer, typename A, - typename B, typename... Ts -> struct reduce +> struct reduce { - constexpr static inline auto run(A a, B b, Ts... 
ts) -> decltype(Reducer::run(a, reduce::run(b, ts...))) { - return Reducer::run(a, reduce::run(b, ts...)); + constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce::run(ts...))) { + return Reducer::run(a, reduce::run(ts...)); } }; -- cgit v1.2.3 From 8fd42413779b41c1fb849045c208335be7e75597 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Mar 2016 02:28:46 +0000 Subject: Fixed a typo. --- unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index c38747532..392cb6e3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -318,7 +318,7 @@ class BaseTensorContractionMapperm_tensor.coeff(this->computeIndex(i, j)); - return pload(data); + return pload(data); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { -- cgit v1.2.3 From 86d45a3c837ba289d9c7d2e36caefe69db256206 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Mar 2016 21:29:39 -0800 Subject: Worked around visual studio compilation warnings. --- unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index ad2a1e6ac..2d223140e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -60,7 +60,7 @@ struct Initializer { typedef typename traits::Scalar InitList; static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>*/* indices*/, + Eigen::array::Index, traits::NumDimensions>*, const InitList& v) { tensor.coeffRef(0) = v; } -- cgit v1.2.3 From 456e038a4e50c9297489f51de42b8ba126a77709 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Mar 2016 15:17:44 -0800 Subject: Fixed the +=, -=, *= and /= operators to return a reference --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 720155ce1..9e3c51d49 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -38,19 +38,19 @@ __device__ half operator / (const half& a, const half& b) { __device__ half operator - (const half& a) { return __hneg(a); } -__device__ half operator += (half& a, const half& b) { +__device__ half& operator += (half& a, const half& b) { a = a + b; return a; } -__device__ half operator *= (half& a, const half& b) { +__device__ half& operator *= (half& a, const half& b) { a = a * b; return a; } -__device__ half operator -= (half& a, const half& b) { +__device__ half& operator -= (half& a, const half& b) { a = a - b; return a; } -__device__ half operator /= (half& a, const half& b) { +__device__ half& operator /= (half& a, const half& b) { a = a / b; return a; } -- cgit v1.2.3 From c5b98a58b8ea018be71cdc03f7060d2b96184a84 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 11 Mar 2016 11:53:38 -0800 Subject: Updated the cxx11_meta test to work on the Eigen::array class when std::array isn't available. 
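The substitution below leans only on aggregate-style initialization, which a hand-rolled fixed-size array supports exactly like std::array. A reduced illustration (array_sketch is a hypothetical stand-in for Eigen's emulated array, not the real class from EmulateArray.h):

    #include <cstddef>

    template <typename T, std::size_t N>
    struct array_sketch {
      T values[N];  // plain aggregate member
      T& operator[](std::size_t i) { return values[i]; }
      const T& operator[](std::size_t i) const { return values[i]; }
    };

    int main() {
      // Double braces, as in the updated test: the outer pair
      // initializes the struct, the inner pair the values[] member.
      array_sketch<int, 6> a{{4, 8, 15, 16, 23, 42}};
      return a[5] == 42 ? 0 : 1;
    }
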
--- unsupported/test/cxx11_meta.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index f9179bbe6..62db0e71e 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -294,8 +294,8 @@ static void test_arg_reductions() static void test_array_reverse_and_reduce() { - std::array a{{4, 8, 15, 16, 23, 42}}; - std::array b{{42, 23, 16, 15, 8, 4}}; + array a{{4, 8, 15, 16, 23, 42}}; + array b{{42, 23, 16, 15, 8, 4}}; // there is no operator<< for std::array, so VERIFY_IS_EQUAL will // not compile @@ -309,11 +309,11 @@ static void test_array_reverse_and_reduce() static void test_array_zip_and_apply() { - std::array a{{4, 8, 15, 16, 23, 42}}; - std::array b{{0, 1, 2, 3, 4, 5}}; - std::array c{{4, 9, 17, 19, 27, 47}}; - std::array d{{0, 8, 30, 48, 92, 210}}; - std::array e{{0, 2, 4, 6, 8, 10}}; + array a{{4, 8, 15, 16, 23, 42}}; + array b{{0, 1, 2, 3, 4, 5}}; + array c{{4, 9, 17, 19, 27, 47}}; + array d{{0, 8, 30, 48, 92, 210}}; + array e{{0, 2, 4, 6, 8, 10}}; VERIFY((array_zip(a, b) == c)); VERIFY((array_zip(a, b) == d)); @@ -326,8 +326,8 @@ static void test_array_zip_and_apply() static void test_array_misc() { - std::array a3{{1, 1, 1}}; - std::array a6{{2, 2, 2, 2, 2, 2}}; + array a3{{1, 1, 1}}; + array a6{{2, 2, 2, 2, 2, 2}}; VERIFY((repeat<3, int>(1) == a3)); VERIFY((repeat<6, int>(2) == a6)); -- cgit v1.2.3 From 25f69cb932f05b8509df14d14d2779a20fc9b091 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 11 Mar 2016 15:20:37 -0800 Subject: Added a comparison operator for Eigen::array Alias Eigen::array to std::array when compiling with Visual Studio 2015 --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index eae8b996c..894b22009 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -15,7 +15,7 @@ // The array class is only available starting with cxx11. Emulate our own here // if needed. // Moreover, CUDA doesn't support the STL containers, so we use our own instead. 
-#if __cplusplus <= 199711L || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) +#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) namespace Eigen { template class array { @@ -177,6 +177,19 @@ template class array { T dummy; }; +// Comparison operator +// Todo: implement !=, <, <=, >, and >= +template +bool operator==(const array& lhs, const array& rhs) { + for (std::size_t i = 0; i < N; ++i) { + if (lhs[i] != rhs[i]) { + return false; + } + } + return true; +} + + namespace internal { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { -- cgit v1.2.3 From b72ffcb05e2abd631a0302061e06972f5fe8b0cc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 11 Mar 2016 16:37:59 -0800 Subject: Made the comparison of Eigen::array GPU friendly --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 894b22009..efe688e50 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -180,7 +180,7 @@ template class array { // Comparison operator // Todo: implement !=, <, <=, >, and >= template -bool operator==(const array& lhs, const array& rhs) { +EIGEN_DEVICE_FUNC bool operator==(const array& lhs, const array& rhs) { for (std::size_t i = 0; i < N; ++i) { if (lhs[i] != rhs[i]) { return false; -- cgit v1.2.3 From 048c4d6efd34ae26cebf5a6b662d4480dfe61f0e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 11 Mar 2016 17:21:42 -0800 Subject: Made half floats usable on hardware that doesn't support them natively. --- Eigen/Core | 2 + Eigen/src/Core/arch/CUDA/Half.h | 337 +++++++++++++++++++++ Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 46 --- unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 16 +- 4 files changed, 343 insertions(+), 58 deletions(-) create mode 100644 Eigen/src/Core/arch/CUDA/Half.h diff --git a/Eigen/Core b/Eigen/Core index 7107f83d0..8428c51e4 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -331,6 +331,8 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/Complex.h" #endif +#include "src/Core/arch/CUDA/Half.h" + #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" #include "src/Core/arch/CUDA/PacketMathHalf.h" diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h new file mode 100644 index 000000000..419790c3e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -0,0 +1,337 @@ +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// class Eigen::half (inheriting from CUDA's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. +// +// +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef EIGEN_HALF_CUDA_H +#define EIGEN_HALF_CUDA_H + +#if !defined(EIGEN_HAS_CUDA_FP16) + +// Make our own __half definition that is similar to CUDA's. +struct __half { + uint16_t x; +}; + +#endif + +namespace Eigen { + +namespace internal { + +static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(uint16_t x); +static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); +static inline EIGEN_DEVICE_FUNC float half_to_float(__half h); + +} // end namespace internal + +// Class definition. +struct half : public __half { + EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {} + + // TODO(sesse): Should these conversions be marked as explicit? + EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {} + EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(i)) {} + EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(d)) {} + EIGEN_DEVICE_FUNC half(bool b) + : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half(const volatile half& h) + : __half(internal::raw_uint16_to_half(h.x)) {} + + EIGEN_DEVICE_FUNC explicit operator float() const { + return internal::half_to_float(*this); + } + EIGEN_DEVICE_FUNC explicit operator double() const { + return internal::half_to_float(*this); + } + + EIGEN_DEVICE_FUNC half& operator=(const half& other) { + x = other.x; + return *this; + } + EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) { + x = other.x; + return *this; + } + EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile { + x = other.x; + return *this; + } +}; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +// Intrinsics for native fp16 support. Note that on current hardware, +// these are no faster than fp32 arithmetic (you need to use the half2 +// versions to get the ALU speed increased), but you do save the +// conversion steps back and forth. 
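What the overloads that follow buy in practice: Eigen::half can be used like a builtin scalar, with every operation either mapped to an fp16 intrinsic or routed through float. A minimal host-side usage sketch (half_round_trip is our name for the example; it assumes this Half.h is reachable via Eigen/Core, as the Eigen/Core hunk above sets up):

    #include <Eigen/Core>

    float half_round_trip(float x) {
      Eigen::half h(x);           // float -> half, round-to-nearest-even
      h += Eigen::half(1.0f);     // compound assignment defined below
      h = h * Eigen::half(0.5f);  // arithmetic through float on the host
      return float(h);            // half -> float conversion
    }
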
+ +__device__ half operator + (const half& a, const half& b) { + return __hadd(a, b); +} +__device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +__device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +__device__ half operator / (const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} +__device__ half operator - (const half& a) { + return __hneg(a); +} +__device__ half& operator += (half& a, const half& b) { + a = a + b; + return a; +} +__device__ half& operator *= (half& a, const half& b) { + a = a * b; + return a; +} +__device__ half& operator -= (half& a, const half& b) { + a = a - b; + return a; +} +__device__ half& operator /= (half& a, const half& b) { + a = a / b; + return a; +} +__device__ bool operator == (const half& a, const half& b) { + return __heq(a, b); +} +__device__ bool operator != (const half& a, const half& b) { + return __hne(a, b); +} +__device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +__device__ bool operator > (const half& a, const half& b) { + return __hgt(a, b); +} + +#else // Not CUDA 530 + +// Definitions for CPUs and older CUDA, mostly working through conversion +// to/from fp32. + +static inline EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +static inline EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +static inline EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +static inline EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +static inline EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +static inline EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +static inline EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +static inline EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +static inline EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +static inline EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return float(a) == float(b); +} +static inline EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return float(a) != float(b); +} +static inline EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} + +#endif // Not CUDA 530 + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly.
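For orientation while reading the routines that follow: IEEE 754 binary16 packs 1 sign bit, 5 exponent bits (bias 15) and 10 mantissa bits into the 16-bit x field, and the isinf/isnan bit tests further down fall directly out of that layout. A standalone sanity check of the constants involved (the encodings are standard IEEE values, not taken from the patch):

    #include <cassert>

    int main() {
      const unsigned short one  = 0x3c00;  // sign 0, exponent == bias 15, mantissa 0
      const unsigned short inf  = 0x7c00;  // exponent all ones, mantissa 0
      const unsigned short qnan = 0x7e00;  // exponent all ones, mantissa != 0

      assert(((one >> 10) & 0x1f) == 15);  // exponent field of 1.0 holds the bias
      assert((inf & 0x7fff) == 0x7c00);    // matches the isinf test below
      assert((qnan & 0x7fff) > 0x7c00);    // matches the isnan test below
      return 0;
    }
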
+ +namespace internal { + +static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(uint16_t x) { + __half h; + h.x = x; + return h; +} + +union FP32 { + uint32_t u; + float f; +}; + +static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +#if defined(__CUDA_ARCH__) && defined(EIGEN_HAS_CUDA_FP16) + return __float2half(ff); +#else + FP32 f; f.f = ff; + + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + uint32_t sign_mask = 0x80000000u; + __half o = { 0 }; + + uint32_t sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = f.u - denorm_magic.u; + } else { + uint32_t mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((uint32_t)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = f.u >> 13; + } + } + + o.x |= sign >> 16; + return o; +#endif +} + +static inline EIGEN_DEVICE_FUNC float half_to_float(__half h) { +#if defined(__CUDA_ARCH__) && defined(EIGEN_HAS_CUDA_FP16) + return __half2float(h); +#else + const FP32 magic = { 113 << 23 }; + const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +} // end namespace internal + +// Traits. + +namespace internal { + +template<> struct is_arithmetic { enum { value = true }; }; + +} // end namespace internal + +// Infinity/NaN checks. + +namespace numext { + +static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { + return (a.x & 0x7fff) == 0x7c00; +} +static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hisnan(x); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} + +} // end namespace numext + +} // end namespace Eigen + +// Standard mathematical functions and trancendentals. 
+ +namespace std { + +static inline EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { + Eigen::half result; + result.x = a.x & 0x7FFF; + return result; +} +static inline EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) { + return Eigen::half(expf(float(a))); +} +static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { + return Eigen::half(logf(float(a))); +} + +} // end namespace std + +#endif // EIGEN_HALF_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 9e3c51d49..9e1d87062 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -19,55 +19,9 @@ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 -// The following operations require arch >= 5.3 -#if __CUDA_ARCH__ >= 530 -__device__ half operator + (const half& a, const half& b) { - return __hadd(a, b); -} -__device__ half operator * (const half& a, const half& b) { - return __hmul(a, b); -} -__device__ half operator - (const half& a, const half& b) { - return __hsub(a, b); -} -__device__ half operator / (const half& a, const half& b) { - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -} -__device__ half operator - (const half& a) { - return __hneg(a); -} -__device__ half& operator += (half& a, const half& b) { - a = a + b; - return a; -} -__device__ half& operator *= (half& a, const half& b) { - a = a * b; - return a; -} -__device__ half& operator -= (half& a, const half& b) { - a = a - b; - return a; -} -__device__ half& operator /= (half& a, const half& b) { - a = a / b; - return a; -} - -namespace std { -__device__ half abs(const half& a) { - half result; - result.x = a.x & 0x7FFF; - return result; -} -} -#endif - namespace Eigen { namespace internal { -template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct packet_traits : default_packet_traits diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index 7936a9126..d9ed4c855 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -19,30 +19,28 @@ using Eigen::Tensor; -#ifdef EIGEN_HAS_CUDA_FP16 - void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; - + Tensor floats(num_elem); floats.setRandom(); float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - half* d_half = (half*)gpu_device.allocate(num_elem * sizeof(half)); + Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::TensorMap, Eigen::Aligned> gpu_float( d_float, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_half( + Eigen::TensorMap, Eigen::Aligned> gpu_half( d_half, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_conv( d_conv, num_elem); gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float)); - gpu_half.device(gpu_device) = gpu_float.cast(); + gpu_half.device(gpu_device) = gpu_float.cast(); gpu_conv.device(gpu_device) = gpu_half.cast(); Tensor initial(num_elem); @@ -60,14 +58,8 @@ void test_cuda_conversion() { gpu_device.deallocate(d_conv); } -#endif - void test_cxx11_tensor_cast_float16_cuda() { -#ifdef EIGEN_HAS_CUDA_FP16 CALL_SUBTEST(test_cuda_conversion()); -#else - std::cout << "Half floats are not 
supported by this version of cuda: skipping the test" << std::endl; -#endif } -- cgit v1.2.3 From 0423b66187fc4c9dba69ee7ab77b476171b8a12a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 11 Mar 2016 17:53:41 -0800 Subject: Use unsigned short instead of uint16_t since they're more portable --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 419790c3e..c243dc32a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -39,7 +39,7 @@ // Make our own __half definition that is similar to CUDA's. struct __half { - uint16_t x; + unsigned short x; }; #endif -- cgit v1.2.3 From 1ca8c1ec97c59034ed10b37974de2dc34971d08a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 11 Mar 2016 19:28:28 -0800 Subject: Replaced a couple more uint16_t with unsigned short --- Eigen/src/Core/arch/CUDA/Half.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index c243dc32a..97c5b4c3e 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -48,7 +48,7 @@ namespace Eigen { namespace internal { -static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(uint16_t x); +static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); static inline EIGEN_DEVICE_FUNC float half_to_float(__half h); @@ -203,7 +203,7 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { namespace internal { -static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(uint16_t x) { +static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { __half h; h.x = x; return h; -- cgit v1.2.3 From eecd91486470bdebe7238d9f536741e94fefa4b3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 11 Mar 2016 19:34:21 -0800 Subject: Also replaced uint32_t with unsigned int to make the code more portable --- Eigen/src/Core/arch/CUDA/Half.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 97c5b4c3e..4e5d9ad68 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -210,7 +210,7 @@ static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { } union FP32 { - uint32_t u; + unsigned int u; float f; }; @@ -223,10 +223,10 @@ static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { const FP32 f32infty = { 255 << 23 }; const FP32 f16max = { (127 + 16) << 23 }; const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; - uint32_t sign_mask = 0x80000000u; + unsigned int sign_mask = 0x80000000u; __half o = { 0 }; - uint32_t sign = f.u & sign_mask; + unsigned int sign = f.u & sign_mask; f.u ^= sign; // NOTE all the integer compares in this function can be safely @@ -246,10 +246,10 @@ static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { // and one integer subtract of the bias later, we have our final float! o.x = f.u - denorm_magic.u; } else { - uint32_t mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd // update exponent, rounding bias part 1 - f.u += ((uint32_t)(15 - 127) << 23) + 0xfff; + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; // rounding bias part 2 f.u += mant_odd; // take the bits! 
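The two "rounding bias" lines in the hunk just above implement round-half-to-even for the 13 mantissa bits being dropped. Isolated from the exponent handling, the trick looks like this (round_shift13_rne is a made-up name for illustration):

    #include <cassert>

    unsigned int round_shift13_rne(unsigned int u) {
      unsigned int mant_odd = (u >> 13) & 1;  // lowest bit that survives the shift
      u += 0xfff + mant_odd;                  // half an ulp minus one, plus one if odd
      return u >> 13;
    }

    int main() {
      assert(round_shift13_rne(0x4fff) == 2);  // just below the tie: round down
      assert(round_shift13_rne(0x5000) == 2);  // tie with even result: stays even
      assert(round_shift13_rne(0x7000) == 4);  // tie with odd result: up to even
      assert(round_shift13_rne(0x5001) == 3);  // just above the tie: round up
      return 0;
    }
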
@@ -267,17 +267,17 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h) { return __half2float(h); #else const FP32 magic = { 113 << 23 }; - const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift FP32 o; - o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits - uint32_t exp = shifted_exp & o.u; // just the exponent - o.u += (127 - 15) << 23; // exponent adjust + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust // handle exponent special cases - if (exp == shifted_exp) { // Inf/NaN? + if (exp == shifted_exp) { // Inf/NaN? o.u += (128 - 16) << 23; // extra exp adjust - } else if (exp == 0) { // Zero/Denormal? + } else if (exp == 0) { // Zero/Denormal? o.u += 1 << 23; // extra exp adjust o.f -= magic.f; // renormalize } -- cgit v1.2.3 From e29c9676b1a7f0e82af0c4c4c63cfe16813340ad Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 12 Mar 2016 00:15:58 -0800 Subject: Don't mark the cast operator as explicit, since this is a c++11 feature that's not supported by older compilers. --- Eigen/src/Core/arch/CUDA/Half.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 4e5d9ad68..5ce2be165 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -69,10 +69,10 @@ struct half : public __half { EIGEN_DEVICE_FUNC half(const volatile half& h) : __half(internal::raw_uint16_to_half(h.x)) {} - EIGEN_DEVICE_FUNC explicit operator float() const { + EIGEN_DEVICE_FUNC operator float() const { return internal::half_to_float(*this); } - EIGEN_DEVICE_FUNC explicit operator double() const { + EIGEN_DEVICE_FUNC operator double() const { return internal::half_to_float(*this); } -- cgit v1.2.3 From b1a9afe9a9a7076d3a56643e85164f39af264abd Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Sun, 13 Mar 2016 15:45:34 -0700 Subject: Add tests in array.cpp that check igamma/igammac properties. This adds to the set of existing tests, which compare a specific set of values to third party calculated ground truth. 
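The identities the new block checks are standard facts about the incomplete gamma functions. With Gamma(a,x) the upper and gamma(a,x) the lower unnormalized integral, so that igammac(a,x) = Gamma(a,x)/Gamma(a) and igamma(a,x) = gamma(a,x)/Gamma(a), they read, in LaTeX notation:

    \Gamma(a,x) = \int_x^\infty t^{a-1} e^{-t}\,dt, \qquad
    \gamma(a,x) = \int_0^x t^{a-1} e^{-t}\,dt

    \Gamma(a,x) + \gamma(a,x) = \Gamma(a)

    \Gamma(a,x) = (a-1)\,\Gamma(a-1,x) + x^{a-1} e^{-x}

    \gamma(a,x) = (a-1)\,\gamma(a-1,x) - x^{a-1} e^{-x}

Both recurrences follow from a single integration by parts on the respective integral, which is why the test multiplies the normalized results by a.lgamma().exp(), i.e. by Gamma(a), before comparing.
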
--- test/array.cpp | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/test/array.cpp b/test/array.cpp index c61bfc8ed..d05744c4a 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -304,22 +304,14 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(log10(m3), log(m3)/log(10)); - // Smoke test to check any compilation issues - ArrayType m1_abs_p1 = m1.abs() + 1; - ArrayType m2_abs_p1 = m2.abs() + 1; - VERIFY_IS_APPROX(Eigen::igamma(m1_abs_p1, m2_abs_p1), Eigen::igamma(m1_abs_p1, m2_abs_p1)); - VERIFY_IS_APPROX(Eigen::igammac(m1_abs_p1, m2_abs_p1), Eigen::igammac(m1_abs_p1, m2_abs_p1)); - VERIFY_IS_APPROX(Eigen::igamma(m2_abs_p1, m1_abs_p1), Eigen::igamma(m2_abs_p1, m1_abs_p1)); - VERIFY_IS_APPROX(Eigen::igammac(m2_abs_p1, m1_abs_p1), Eigen::igammac(m2_abs_p1, m1_abs_p1)); - // scalar by array division const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); m1 += ArrayType::Constant(rows,cols,Scalar(tiny)); VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse()); - // check special functions (comparing against numpy implementation) #ifdef EIGEN_HAS_C99_MATH + // check special functions (comparing against numpy implementation) if (!NumTraits::IsComplex) { VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329)); VERIFY_IS_APPROX(numext::digamma(Scalar(1.5)), RealScalar(0.03648997397857645)); @@ -331,6 +323,37 @@ template void array_real(const ArrayType& m) VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); + + { + // Test various properties of igamma & igammac. These are normalized + // gamma integrals where + // igammac(a, x) = Gamma(a, x) / Gamma(a) + // igamma(a, x) = gamma(a, x) / Gamma(a) + // where Gamma and gamma are considered the standard unnormalized + // upper and lower incomplete gamma functions, respectively. + ArrayType a = m1.abs() + 2; + ArrayType x = m2.abs() + 2; + ArrayType zero = ArrayType::Zero(rows, cols); + ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0)); + ArrayType a_m1 = a - one; + ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp(); + ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp(); + ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp(); + ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp(); + + // Gamma(a, 0) == Gamma(a) + VERIFY_IS_APPROX(Eigen::igammac(a, zero), one); + + // Gamma(a, x) + gamma(a, x) == Gamma(a) + VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp()); + + // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x) + VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp()); + + // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x) + VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp()); + } + + // Check exact values of igamma and igammac against a third party calculation.
Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; -- cgit v1.2.3 From 97a1f1c2735e0f393b8492485f3db63cea4ba7c0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 14 Mar 2016 08:37:58 -0700 Subject: Make sure we only use the half float intrinsic when compiling with a version of CUDA that is recent enough to provide them --- Eigen/src/Core/arch/CUDA/Half.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 5ce2be165..35e216028 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -90,7 +90,7 @@ struct half : public __half { } }; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 // Intrinsics for native fp16 support. Note that on current hardware, // these are no faster than fp32 arithmetic (you need to use the half2 @@ -143,7 +143,7 @@ __device__ bool operator > (const half& a, const half& b) { return __hgt(a, b); } -#else // Not CUDA 530 +#else // Emulate support for half floats // Definitions for CPUs and older CUDA, mostly working through conversion // to/from fp32. @@ -194,7 +194,7 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { return float(a) > float(b); } -#endif // Not CUDA 530 +#endif // Emulate support for half floats // Conversion routines, including fallbacks for the host or older CUDA. // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of -- cgit v1.2.3 From fcf59e1c37b48130eec5d3e26c847c7bb252542d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 14 Mar 2016 09:13:44 -0700 Subject: Properly gate the use of cuda intrinsics in the code --- Eigen/src/Core/arch/CUDA/Half.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 35e216028..af5d872a5 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -215,7 +215,7 @@ union FP32 { }; static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { -#if defined(__CUDA_ARCH__) && defined(EIGEN_HAS_CUDA_FP16) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(ff); #else FP32 f; f.f = ff; @@ -263,7 +263,7 @@ static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { } static inline EIGEN_DEVICE_FUNC float half_to_float(__half h) { -#if defined(__CUDA_ARCH__) && defined(EIGEN_HAS_CUDA_FP16) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __half2float(h); #else const FP32 magic = { 113 << 23 }; @@ -305,7 +305,7 @@ static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { return (a.x & 0x7fff) == 0x7c00; } static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hisnan(x); #else return (a.x & 0x7fff) > 0x7c00; -- cgit v1.2.3 From 5a51366ea536368079b906c298de9dac3cad33a6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 14 Mar 2016 09:25:16 -0700 Subject: Fixed a typo. 
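Taken together, the two gating patches above settle on a three-tier guard. A minimal sketch of the idiom (the preprocessor conditions are the ones used in Half.h; the comments are explanatory only):

    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
      // Native fp16 arithmetic intrinsics (__hadd, __hgt, __hisnan, ...) exist.
    #elif defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
      // Only the fp16 conversion intrinsics (__float2half, __half2float) exist.
    #else
      // No fp16 hardware support: fall back to emulation through float.
    #endif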
--- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index af5d872a5..953a3a77a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -306,7 +306,7 @@ static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { } static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hisnan(x); + return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; #endif -- cgit v1.2.3 From ab9b749b458325d8db5833c1491635044c495bc2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 14 Mar 2016 20:03:13 -0700 Subject: Improved a test --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 9b9fd843c..29b5637e7 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -27,18 +27,18 @@ void test_cuda_conversion() { int num_elem = 101; float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - half* d_half = (half*)gpu_device.allocate(num_elem * sizeof(half)); + Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::TensorMap, Eigen::Aligned> gpu_float( d_float, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_half( + Eigen::TensorMap, Eigen::Aligned> gpu_half( d_half, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_conv( d_conv, num_elem); gpu_float.device(gpu_device) = gpu_float.random(); - gpu_half.device(gpu_device) = gpu_float.cast(); + gpu_half.device(gpu_device) = gpu_float.cast(); gpu_conv.device(gpu_device) = gpu_half.cast(); Tensor initial(num_elem); @@ -72,9 +72,9 @@ void test_cuda_unary() { Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, num_elem); - gpu_float.device(gpu_device) = gpu_float.random(); + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half.device(gpu_device) = gpu_float.cast().abs().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().abs().cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); @@ -115,7 +115,7 @@ void test_cuda_elementwise() { gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; - gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).cast(); + gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); @@ -162,7 +162,7 @@ void test_cuda_contractions() { typedef Tensor::DimensionPair DimPair; Eigen::array dims(DimPair(1, 0)); gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims); - gpu_res_half.device(gpu_device) = gpu_float1.cast().contract(gpu_float2.cast(), dims).cast(); + gpu_res_half.device(gpu_device) = gpu_float1.cast().contract(gpu_float2.cast(), dims).cast(); Tensor half_prec(rows, cols); Tensor full_prec(rows, cols); @@ -209,7 +209,7 @@ void test_cuda_reductions() { Eigen::array redux_dim = {{0}}; gpu_res_float.device(gpu_device) 
= gpu_float1.sum(redux_dim); - gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim).cast(); + gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim).cast(); Tensor half_prec(size); Tensor full_prec(size); -- cgit v1.2.3 From 1f69a1b65fd5cbf6610e7d6522d57b4cb9acd686 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 16 Mar 2016 12:44:35 -0700 Subject: Change the header guard around certain numext functions to be CUDA specific. --- Eigen/src/Core/MathFunctions.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 311808187..ec75175ca 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -946,7 +946,7 @@ T (floor)(const T& x) return floor(x); } -#ifdef EIGEN_HAS_C99_MATH +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1000,7 +1000,7 @@ T log(const T &x) { return log(x); } -#ifdef EIGEN_HAS_C99_MATH +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1015,7 +1015,7 @@ T tan(const T &x) { return tan(x); } -#ifdef EIGEN_HAS_C99_MATH +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1030,7 +1030,7 @@ T abs(const T &x) { return abs(x); } -#ifdef EIGEN_HAS_C99_MATH +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1045,7 +1045,7 @@ T exp(const T &x) { return exp(x); } -#ifdef EIGEN_HAS_C99_MATH +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } -- cgit v1.2.3 From f7329619da8d493fecc30e2a5d44bc3a672741a3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Mar 2016 15:08:02 -0700 Subject: Fix bug in tensor contraction. The code assumes that contraction axis indices for the LHS (after possibly swapping to ColMajor!) are increasing. Explicitly sort the contraction axis pairs to make it so. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 13 +++++++++++++ unsupported/test/cxx11_tensor_contraction.cpp | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 18b20b2dc..f070ba61e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -193,6 +193,19 @@ struct TensorContractionEvaluatorBase } } + // Check for duplicate axes and make sure the first index in eval_op_indices + // is increasing. 
Using O(n^2) sorting is OK since ContractDims is small + for (int i = 0; i < ContractDims; i++) { + for (int j = i + 1; j < ContractDims; j++) { + eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && + eval_op_indices[j].second != eval_op_indices[i].second && + "contraction axes should be unique"); + if (eval_op_indices[j].first < eval_op_indices[i].first) { + numext::swap(eval_op_indices[j], eval_op_indices[i]); + } + } + } + array lhs_strides; lhs_strides[0] = 1; for (int i = 0; i < LDims-1; ++i) { diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 57ec5add7..0e16308a2 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -138,6 +138,26 @@ static void test_multidims() mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1)); VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) + mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1)); + + Tensor mat4(2, 2); + Tensor mat5(2, 2, 2); + + mat4.setRandom(); + mat5.setRandom(); + + Tensor mat6(2); + mat6.setZero(); + Eigen::array dims2({{DimPair(0, 1), DimPair(1, 0)}}); + typedef TensorEvaluator Evaluator2; + Evaluator2 eval2(mat4.contract(mat5, dims2), DefaultDevice()); + eval2.evalTo(mat6.data()); + EIGEN_STATIC_ASSERT(Evaluator2::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); + + VERIFY_IS_APPROX(mat6(0), mat4(0,0)*mat5(0,0,0) + mat4(1,0)*mat5(0,1,0) + + mat4(0,1)*mat5(1,0,0) + mat4(1,1)*mat5(1,1,0)); + VERIFY_IS_APPROX(mat6(1), mat4(0,0)*mat5(0,0,1) + mat4(1,0)*mat5(0,1,1) + + mat4(0,1)*mat5(1,0,1) + mat4(1,1)*mat5(1,1,1)); } template -- cgit v1.2.3 From 95b8961a9b2ac3e063ba9ddb7ac8515e24cae6c2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Mar 2016 15:23:51 -0700 Subject: Allocate the mersenne twister used by the random number generators on the heap instead of on the stack since they tend to keep a lot of state (i.e. about 5k) around. --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 49 ++++++++++++++-------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 8e45ae9e5..26e5dafce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -364,19 +364,23 @@ template <> class UniformRandomGenerator { public: static const bool PacketAccess = true; - UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { + UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) { if (!deterministic) { - m_generator.seed(get_random_seed()); + m_generator->seed(get_random_seed()); } } UniformRandomGenerator(const UniformRandomGenerator& other) { - m_generator.seed(other(0) * UINT_MAX); + m_generator = new std::mt19937(); + m_generator->seed(other(0) * UINT_MAX); m_deterministic = other.m_deterministic; } + ~UniformRandomGenerator() { + delete m_generator; + } template float operator()(Index) const { - return m_distribution(m_generator); + return m_distribution(*m_generator); } template PacketType packetOp(Index i) const { @@ -393,7 +397,7 @@ template <> class UniformRandomGenerator { // Make sure m_deterministic comes first to match the layout of the cpu // version of the code. 
bool m_deterministic; - mutable std::mt19937 m_generator; + mutable std::mt19937* m_generator; mutable std::uniform_real_distribution m_distribution; }; @@ -401,19 +405,23 @@ template <> class UniformRandomGenerator { public: static const bool PacketAccess = true; - UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { + UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) { if (!deterministic) { - m_generator.seed(get_random_seed()); + m_generator->seed(get_random_seed()); } } UniformRandomGenerator(const UniformRandomGenerator& other) { - m_generator.seed(other(0) * UINT_MAX); + m_generator = new std::mt19937(); + m_generator->seed(other(0) * UINT_MAX); m_deterministic = other.m_deterministic; } + ~UniformRandomGenerator() { + delete m_generator; + } template double operator()(Index) const { - return m_distribution(m_generator); + return m_distribution(*m_generator); } template PacketType packetOp(Index i) const { @@ -430,7 +438,7 @@ template <> class UniformRandomGenerator { // Make sure m_deterministic comes first to match the layout of the cpu // version of the code. bool m_deterministic; - mutable std::mt19937 m_generator; + mutable std::mt19937* m_generator; mutable std::uniform_real_distribution m_distribution; }; #endif @@ -571,34 +579,39 @@ template class NormalRandomGenerator { public: static const bool PacketAccess = true; - NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1) { + NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) { if (!deterministic) { - m_generator.seed(get_random_seed()); + m_generator->seed(get_random_seed()); } } NormalRandomGenerator(const NormalRandomGenerator& other) - : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution) { - m_generator.seed(other(0) * UINT_MAX); + : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) { + m_generator->seed(other(0) * UINT_MAX); + } + ~NormalRandomGenerator() { + delete m_generator; } - template T operator()(Index) const { - return m_distribution(m_generator); + return m_distribution(*m_generator); } template PacketType packetOp(Index) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; for (int i = 0; i < packetSize; ++i) { - values[i] = m_distribution(m_generator); + values[i] = m_distribution(*m_generator); } return internal::pload(values); } private: + // No assignment + NormalRandomGenerator& operator = (const NormalRandomGenerator&); + bool m_deterministic; mutable std::normal_distribution m_distribution; - mutable std::mt19937 m_generator; + mutable std::mt19937* m_generator; }; #elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) -- cgit v1.2.3 From afb81b7ded11f7ef764334b266f8426669194a0f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Mar 2016 21:24:24 -0700 Subject: Made sure to use the hard abi when compiling with NEON instructions to avoid the "gnu/stubs-soft.h: No such file or directory" error --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c979747c..95f4c8d7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,7 +240,7 @@ if(NOT MSVC) else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") endif() - set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -mfloat-abi=softfp") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard") message(STATUS "Enabling NEON in tests/examples") endif() -- cgit v1.2.3 From 7b98de1f15dd9f686e67b88c78708d4adc15adf5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Mar 2016 21:45:45 -0700 Subject: Implemented some of the missing type casting for half floats --- Eigen/src/Core/arch/CUDA/TypeCasting.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index b59b42170..10610ac44 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -24,8 +24,7 @@ struct scalar_cast_op { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(a); #else - assert(false && "tbd"); - return half(); + return half(a); #endif } }; @@ -43,8 +42,7 @@ struct scalar_cast_op { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(static_cast(a)); #else - assert(false && "tbd"); - return half(); + return half(static_cast(a)); #endif } }; @@ -62,8 +60,7 @@ struct scalar_cast_op { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __half2float(a); #else - assert(false && "tbd"); - return 0.0f; + return static_cast(a); #endif } }; -- cgit v1.2.3 From 70eb70f5f84a353c89be84e752aa32d66ba273b3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Mar 2016 21:47:18 -0700 Subject: Avoid mutable class members when possible --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 26e5dafce..c71a30d21 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -397,7 +397,7 @@ template <> class UniformRandomGenerator { // Make sure m_deterministic comes first to match the layout of the cpu // version of the code. bool m_deterministic; - mutable std::mt19937* m_generator; + std::mt19937* m_generator; mutable std::uniform_real_distribution m_distribution; }; @@ -438,7 +438,7 @@ template <> class UniformRandomGenerator { // Make sure m_deterministic comes first to match the layout of the cpu // version of the code. 
bool m_deterministic; - mutable std::mt19937* m_generator; + std::mt19937* m_generator; mutable std::uniform_real_distribution m_distribution; }; #endif @@ -611,7 +611,7 @@ template class NormalRandomGenerator { bool m_deterministic; mutable std::normal_distribution m_distribution; - mutable std::mt19937* m_generator; + std::mt19937* m_generator; }; #elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) -- cgit v1.2.3 From 53d498ef064dddf616af68468ac1cd5375618467 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 07:04:54 -0700 Subject: Fixed compilation warnings in the cuda tests --- unsupported/test/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e2e5470c8..79c26fb72 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -175,7 +175,11 @@ endif() # These tests needs nvcc find_package(CUDA 7.0) if(CUDA_FOUND) -# set(CUDA_PROPAGATE_HOST_FLAGS OFF) + # Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime + string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() -- cgit v1.2.3 From edc679f6c695090b2af604bf4ca7cf2a297aad09 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 07:12:34 -0700 Subject: Fixed compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 444766f96..c33d54d6e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,7 +126,7 @@ struct FullReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) { assert(false && "Should only be called on floats"); } -- cgit v1.2.3 From 9a7ece9cafc7d6a6044e4a98692981c539aa1706 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 10:38:29 -0700 Subject: Worked around constness issue --- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 5023371ae..c21a98fe0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -242,8 +242,8 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { - Scalar* result = m_impl.data(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + CoeffReturnType* result = const_cast(m_impl.data()); if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && result) { -- cgit v1.2.3 From dd514de8a9fb64cb232a9d317346a2e06b5a328d Mon Sep 17 00:00:00 2001 From: 
Benoit Steiner Date: Fri, 18 Mar 2016 12:02:39 -0700 Subject: Added a test to validate the fallback path for half floats --- unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index d9ed4c855..fece57482 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -59,7 +59,22 @@ void test_cuda_conversion() { } +void test_fallback_conversion() { + int num_elem = 101; + Tensor floats(num_elem); + floats.setRandom(); + + Eigen::Tensor halfs = floats.cast(); + Eigen::Tensor conv = half.cast(); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(floats(i), conv(i)); + } +} + + void test_cxx11_tensor_cast_float16_cuda() { CALL_SUBTEST(test_cuda_conversion()); + CALL_SUBTEST(test_fallback_conversion()); } -- cgit v1.2.3 From 2db4a048278db4e07eefa2ae9ad4ce7254c9faec Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 12:08:01 -0700 Subject: Fixed a typo --- unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index fece57482..f22b99de8 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -65,7 +65,7 @@ void test_fallback_conversion() { floats.setRandom(); Eigen::Tensor halfs = floats.cast(); - Eigen::Tensor conv = half.cast(); + Eigen::Tensor conv = halfs.cast(); for (int i = 0; i < num_elem; ++i) { VERIFY_IS_APPROX(floats(i), conv(i)); -- cgit v1.2.3 From bb0e73c1914c675755e8fab2a6db168b65a4de51 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 12:17:37 -0700 Subject: Gate all the CUDA tests under the EIGEN_TEST_NVCC option --- cmake/EigenTesting.cmake | 6 ++++++ unsupported/test/CMakeLists.txt | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 5ca800cfe..c70ec2c24 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -309,6 +309,12 @@ macro(ei_testing_print_summary) message(STATUS "C++11: OFF") endif() + if(EIGEN_TEST_NVCC) + message(STATUS "CUDA: ON") + else() + message(STATUS "CUDA: OFF") + endif() + endif() # vectorization / alignment options message(STATUS "\n${EIGEN_TESTING_SUMMARY}") diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 79c26fb72..19893cc25 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -174,7 +174,7 @@ endif() # These tests needs nvcc find_package(CUDA 7.0) -if(CUDA_FOUND) +if(CUDA_FOUND AND EIGEN_TEST_NVCC) # Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -- cgit v1.2.3 From 7bd551b3a919dc48ba7d629e8dfed75a80b17e57 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 12:20:08 -0700 Subject: Make all the conversions explicit --- Eigen/src/Core/arch/CUDA/Half.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 953a3a77a..c385b882a 100644 --- 
a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -35,6 +35,13 @@ #ifndef EIGEN_HALF_CUDA_H #define EIGEN_HALF_CUDA_H +#if __cplusplus > 199711L +#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() +#else +#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() +#endif + + #if !defined(EIGEN_HAS_CUDA_FP16) // Make our own __half definition that is similar to CUDA's. @@ -60,8 +67,8 @@ struct half : public __half { // TODO(sesse): Should these conversions be marked as explicit? EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {} - EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(i)) {} - EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(d)) {} + EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast(i))) {} + EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(static_cast(d))) {} EIGEN_DEVICE_FUNC half(bool b) : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {} EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {} @@ -69,10 +76,10 @@ struct half : public __half { EIGEN_DEVICE_FUNC half(const volatile half& h) : __half(internal::raw_uint16_to_half(h.x)) {} - EIGEN_DEVICE_FUNC operator float() const { + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { return internal::half_to_float(*this); } - EIGEN_DEVICE_FUNC operator double() const { + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { return internal::half_to_float(*this); } @@ -244,7 +251,7 @@ static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { f.f += denorm_magic.f; // and one integer subtract of the bias later, we have our final float! - o.x = f.u - denorm_magic.u; + o.x = static_cast(f.u - denorm_magic.u); } else { unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd @@ -253,11 +260,11 @@ static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { // rounding bias part 2 f.u += mant_odd; // take the bits! - o.x = f.u >> 13; + o.x = static_cast(f.u >> 13); } } - o.x |= sign >> 16; + o.x |= static_cast(sign >> 16); return o; #endif } @@ -326,10 +333,10 @@ static inline EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { return result; } static inline EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) { - return Eigen::half(expf(float(a))); + return Eigen::half(::expf(float(a))); } static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { - return Eigen::half(logf(float(a))); + return Eigen::half(::logf(float(a))); } } // end namespace std -- cgit v1.2.3 From 134d750eabac2e001258063c20d45603a18fd6f4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 13:36:28 -0700 Subject: Completed the implementation of vectorized type casting of half floats. 
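The EIGEN_EXPLICIT_CAST macro added in the previous patch enables explicit conversion operators only when the compiler is in C++11 mode. A self-contained illustration of the pattern, using an invented Meters type in place of half:

    #if __cplusplus > 199711L
    #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
    #else
    #define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
    #endif

    struct Meters {
      double v;
      // Expands to `explicit operator double() const` under C++11.
      EIGEN_EXPLICIT_CAST(double) const { return v; }
    };

    // C++11: `double d = Meters{2.0};` is ill-formed, while
    // `static_cast<double>(Meters{2.0})` compiles. Pre-C++11 both compile.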
--- Eigen/src/Core/arch/CUDA/TypeCasting.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index 10610ac44..4c0433267 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -87,8 +87,16 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(con float2 r2 = __half22float2(b); return make_float4(r1.x, r1.y, r2.x, r2.y); #else - assert(false && "tbd"); - return float4(); + half r1; + r1.x = a.x & 0xFFFF; + half r2; + r2.x = (a.x & 0xFFFF0000) >> 16; + half r3; + r3.x = b.x & 0xFFFF; + half r4; + r4.x = (b.x & 0xFFFF0000) >> 16; + return make_float4(static_cast(r1), static_cast(r2), + static_cast(r3), static_cast(r4)); #endif } @@ -106,8 +114,13 @@ template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float22half2_rn(make_float2(a.x, a.y)); #else - assert(false && "tbd"); - return half2(); + half r1 = a.x; + half r2 = a.y; + half2 r; + r.x = 0; + r.x |= r1.x; + r.x |= (static_cast(r2.x) << 16); + return r; #endif } -- cgit v1.2.3 From 6c08943d9fad838213d52d592221f1fd9aab09cb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 15:19:10 -0700 Subject: Fixed a bug in the padding of extracted image patches. --- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 0008f9890..676a2c7e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -226,8 +226,8 @@ struct TensorEvaluator, Device> m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); // Calculate the padding - m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; - m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + m_rowPaddingTop = numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); + m_colPaddingLeft = numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); break; case PADDING_SAME: m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); -- cgit v1.2.3 From 8e03333f06f4a55c5d9698caca208fa91dcc87f3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 18 Mar 2016 15:21:04 -0700 Subject: Renamed some class members to make the code more readable. 
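A worked instance of the PADDING_VALID fix above, with invented numbers: take input_rows = 11, patch_rows = 3, row stride 3, and no dilation or inflation. Then m_outputRows = ceil((11 - 3 + 1) / 3.0) = 3, and the old expression gave m_rowPaddingTop = ((3 - 1) * 3 + 3 - 11) / 2 = -1, a negative top padding that shifted every extracted patch off the image; numext::maxi(0, ...) clamps both paddings to zero in such cases.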
--- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 676a2c7e7..72594a05c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -294,8 +294,8 @@ struct TensorEvaluator, Device> m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); m_fastColStride = internal::TensorIntDivisor(m_colStride); - m_fastInputRowStride = internal::TensorIntDivisor(m_row_inflate_strides); - m_fastInputColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); // Number of patches in the width dimension. @@ -336,7 +336,7 @@ struct TensorEvaluator, Device> const Index colIndex = patch2DIndex / m_fastOutputRows; const Index colOffset = patchOffset / m_fastColStride; const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); if (inputCol < 0 || inputCol >= m_input_cols_eff || ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { return Scalar(m_paddingValue); @@ -346,7 +346,7 @@ struct TensorEvaluator, Device> const Index rowIndex = patch2DIndex - colIndex * m_outputRows; const Index rowOffset = patchOffset - colOffset * m_colStride; const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? 
(inputRow / m_fastInflateRowStride) : 0); if (inputRow < 0 || inputRow >= m_input_rows_eff || ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { return Scalar(m_paddingValue); @@ -467,8 +467,8 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOtherStride; internal::TensorIntDivisor m_fastPatchStride; internal::TensorIntDivisor m_fastColStride; - internal::TensorIntDivisor m_fastInputRowStride; - internal::TensorIntDivisor m_fastInputColStride; + internal::TensorIntDivisor m_fastInflateRowStride; + internal::TensorIntDivisor m_fastInflateColStride; internal::TensorIntDivisor m_fastInputColsEff; Index m_rowInputStride; -- cgit v1.2.3 From b224771f403def7ade226a7410262361f495f668 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sun, 20 Mar 2016 10:57:08 +0100 Subject: bug #1178: Simplified modification of the SSE control register for better portability --- bench/btl/generic_bench/btl.hh | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh index 92af1306a..706b00fb0 100644 --- a/bench/btl/generic_bench/btl.hh +++ b/bench/btl/generic_bench/btl.hh @@ -44,15 +44,10 @@ #define BTL_ASM_COMMENT(X) #endif -#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && !defined(__arm__) && !defined(__powerpc__) -#define BTL_DISABLE_SSE_EXCEPTIONS() { \ - int aux = 0; \ - asm( \ - "stmxcsr %[aux] \n\t" \ - "orl $32832, %[aux] \n\t" \ - "ldmxcsr %[aux] \n\t" \ - : : [aux] "m" (aux)); \ -} +#ifdef __SSE__ +#include "xmmintrin.h" +// This enables flush to zero (FTZ) and denormals are zero (DAZ) modes: +#define BTL_DISABLE_SSE_EXCEPTIONS() { _mm_setcsr(_mm_getcsr() | 0x8040); } #else #define BTL_DISABLE_SSE_EXCEPTIONS() #endif -- cgit v1.2.3 From db5c14de424ef3b43c4afb1aedf6a6f8e5640a06 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 21 Mar 2016 09:52:58 -0700 Subject: Explicitly cast the default value into the proper scalar type. 
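A note on the bug #1178 change above: 0x8040 is decimal 32832, the same constant the old GCC-only inline asm OR'ed into MXCSR; it sets the FTZ (flush-to-zero, bit 0x8000) and DAZ (denormals-are-zero, bit 0x0040) flags. A standalone sketch of the portable form (function name invented):

    #include <xmmintrin.h>

    // Flush denormal results to zero and treat denormal inputs as zero,
    // avoiding the slow microcoded paths they trigger during benchmarks.
    static void disable_denormals() {
      _mm_setcsr(_mm_getcsr() | 0x8040);  // 0x8000 = FTZ, 0x0040 = DAZ
    }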
--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 9597577b9..6ee9c88b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -607,7 +607,7 @@ class TensorBase const TensorVolumePatchOp extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, - const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const { + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); } @@ -619,7 +619,7 @@ class TensorBase const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, const Index padding_top_z, const Index padding_bottom_z, const Index padding_top, const Index padding_bottom, - const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const { + const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); } -- cgit v1.2.3 From e91f25530117a30e1bf71387c9864e3ac601b9ba Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 21 Mar 2016 10:02:00 -0700 Subject: Marked variables that's only used in debug mode as such --- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 5d73d62d2..1fb27a65b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -111,6 +111,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { + EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); } -- cgit v1.2.3 From a9a6710e151ccd5d4fa9a6178db4413ed0c74911 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 21 Mar 2016 13:46:47 -0400 Subject: add initial s390x(zEC13) ZVECTOR support --- CMakeLists.txt | 6 +++++- cmake/EigenTesting.cmake | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 95f4c8d7c..51beba118 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,7 +250,11 @@ if(NOT MSVC) message(STATUS "Enabling NEON in tests/examples") endif() - + option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_ZVECTOR) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector") + message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples") + endif() check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP) if(COMPILER_SUPPORT_OPENMP) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 
c70ec2c24..1709e0334 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -302,7 +302,13 @@ macro(ei_testing_print_summary) else() message(STATUS "ARMv8 NEON: Using architecture defaults") endif() - + + if(EIGEN_TEST_ZVECTOR) + message(STATUS "S390X ZVECTOR: ON") + else() + message(STATUS "S390X ZVECTOR: Using architecture defaults") + endif() + if(EIGEN_TEST_CXX11) message(STATUS "C++11: ON") else() @@ -446,6 +452,8 @@ macro(ei_get_cxxflags VAR) set(${VAR} NEON) elseif(EIGEN_TEST_NEON64) set(${VAR} NEON) + elseif(EIGEN_TEST_ZVECTOR) + set(${VAR} ZVECTOR) elseif(EIGEN_TEST_VSX) set(${VAR} VSX) elseif(EIGEN_TEST_ALTIVEC) -- cgit v1.2.3 From 7a07d6aa2bd2c6c5a9f93896ad34fba8bd9189fe Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 21 Mar 2016 11:12:17 -0700 Subject: Small cleanup --- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index afde7b3d2..e57ba9d9d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -54,7 +54,6 @@ class TensorReshapingOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::remove_const::type CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; @@ -234,7 +233,6 @@ class TensorSlicingOp : public TensorBase::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; -- cgit v1.2.3 From 8ef3181f15a9be76ac783bedd2926ee6f4c69a2f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 21 Mar 2016 11:24:05 -0700 Subject: Worked around a constness related issue --- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index e57ba9d9d..a9c222ea0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -142,7 +142,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } const TensorEvaluator& impl() const { return m_impl; } -- cgit v1.2.3 From f9ad25e4d8453c4265a5fd6d4962a76a386564df Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Mar 2016 09:30:23 -0700 Subject: Fixed contractions of 16 bit floats --- Eigen/src/Core/arch/CUDA/Half.h | 20 ++++++++++---------- .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 10 +++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index c385b882a..921c5bcb2 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -73,8 +73,6 @@ struct half : public __half { : __half(internal::raw_uint16_to_half(b ? 
0x3c00 : 0)) {} EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {} EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {} - EIGEN_DEVICE_FUNC half(const volatile half& h) - : __half(internal::raw_uint16_to_half(h.x)) {} EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { return internal::half_to_float(*this); @@ -87,14 +85,6 @@ struct half : public __half { x = other.x; return *this; } - EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) { - x = other.x; - return *this; - } - EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile { - x = other.x; - return *this; - } }; #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 @@ -341,4 +331,14 @@ static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { } // end namespace std + +// Add the missing shfl_xor intrinsic +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +__device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + return static_cast(__shfl_xor(static_cast(var), laneMask, width)); +} + +#endif + + #endif // EIGEN_HALF_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index a4a06ab5f..dbff660a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -20,7 +20,7 @@ template __device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, const Index m_size, const Index n_size, const Index k_size) { const Index m_block_idx = blockIdx.x; @@ -319,8 +319,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, Scalar rrow(7); // Now x corresponds to k, y to m, and z to n - const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; - const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] @@ -503,8 +503,8 @@ __launch_bounds__(512) EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, const Index n_size, const Index k_size) { - __shared__ volatile Scalar lhs_shmem[72 * 64]; - __shared__ volatile Scalar rhs_shmem[72 * 64]; + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; const Index m_block_idx = blockIdx.x; const Index n_block_idx = blockIdx.y; -- cgit v1.2.3 From 65a7113a36f70aeca34eac29f32b24ef865cb6e4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Mar 2016 09:33:54 -0700 Subject: Use an enum instead of a static const int to prevent possible link error --- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index d6ad65070..6af2d45d4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -41,7 +41,7 @@ template <> struct max_n_1<0> { template struct PacketType { typedef typename 
internal::packet_traits::type type; - static const int size = internal::unpacket_traits::size; + enum { size = internal::unpacket_traits::size }; }; // For CUDA packet types when using a GpuDevice -- cgit v1.2.3 From 6a31b7be3ea29a5300ff575f0bada876b70904d6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Mar 2016 14:02:50 -0700 Subject: Avoid using std::vector whenever possible --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index dcbef5b03..b282f5c07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -27,7 +27,7 @@ class ThreadPoolInterface { class ThreadPool : public ThreadPoolInterface { public: // Construct a pool that contains "num_threads" threads. - explicit ThreadPool(int num_threads) { + explicit ThreadPool(int num_threads) : threads_(num_threads), waiting_(num_threads) { for (int i = 0; i < num_threads; i++) { threads_.push_back(new std::thread([this]() { WorkerLoop(); })); } @@ -110,8 +110,8 @@ class ThreadPool : public ThreadPoolInterface { }; std::mutex mu_; - std::vector threads_; // All threads - std::vector waiters_; // Stack of waiting threads. + MaxSizeVector threads_; // All threads + MaxSizeVector waiters_; // Stack of waiting threads. std::deque> pending_; // Queue of pending work std::condition_variable empty_; // Signaled on pending_.empty() bool exiting_ = false; -- cgit v1.2.3 From e7a468c5b78295e26d970372336bd5f73c90ae34 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Mar 2016 14:26:50 -0700 Subject: Filter some compilation flags that nvcc warns about. 
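The PacketType change from a static const int to an enum, a few patches above, avoids a classic pitfall: a static const int member that is odr-used (for instance, bound to a const int&) needs an out-of-line definition before C++17, otherwise the program may fail to link; an enumerator is a prvalue and never needs one. A minimal reproduction with invented names:

    struct PacketInfo {
      static const int size = 4;  // odr-use requires `const int PacketInfo::size;` in a .cpp
      enum { size_e = 4 };        // never requires a separate definition
    };

    int by_cref(const int& v) { return v; }

    int main() {
      // by_cref(PacketInfo::size);            // may fail at link time pre-C++17
      return by_cref(PacketInfo::size_e) - 4;  // binds a temporary; always links
    }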
--- unsupported/test/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 19893cc25..20048515c 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -175,9 +175,14 @@ endif() # These tests needs nvcc find_package(CUDA 7.0) if(CUDA_FOUND AND EIGEN_TEST_NVCC) - # Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime + # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor + # and -fno-check-new flags since they trigger thousands of compilation warnings + # in the CUDA runtime string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") -- cgit v1.2.3 From bc2b8027514b27b3c67800d5c951e5d532f76f02 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Mar 2016 14:27:34 -0700 Subject: Fixed a couple of typos --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index b282f5c07..4d803c95b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -27,7 +27,7 @@ class ThreadPoolInterface { class ThreadPool : public ThreadPoolInterface { public: // Construct a pool that contains "num_threads" threads. - explicit ThreadPool(int num_threads) : threads_(num_threads), waiting_(num_threads) { + explicit ThreadPool(int num_threads) : threads_(num_threads, NULL), waiting_(num_threads, NULL) { for (int i = 0; i < num_threads; i++) { threads_.push_back(new std::thread([this]() { WorkerLoop(); })); } -- cgit v1.2.3 From 002cf0d1c979857e057879d8c84b92439dbcc90d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Mar 2016 15:24:23 -0700 Subject: Use a single Barrier instead of a collection of Notifications to reduce the thread synchronization overhead --- unsupported/Eigen/CXX11/Tensor | 1 + .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 75 +++++++++++++++++----- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 10 +-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 25 +++----- 4 files changed, 73 insertions(+), 38 deletions(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 969f25481..16132398d 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -51,6 +51,7 @@ typedef unsigned __int64 uint64_t; #endif #ifdef EIGEN_USE_THREADS +#include #include #include #include diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index dcbef5b03..e4165bbf8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -118,47 +118,82 @@ class ThreadPool : public ThreadPoolInterface { }; -// Notification is an object that allows a user to to wait for another -// thread to signal a notification that an event has occurred. 
-// -// Multiple threads can wait on the same Notification object. -// but only one caller must call Notify() on the object. -class Notification { +// Barrier is an object that allows one or more threads to wait until +// Notify has been called a specified number of times. +class Barrier { public: - Notification() : notified_(false) {} - ~Notification() {} + Barrier(unsigned int count) : state_(count << 1), notified_(false) { + eigen_assert(((count << 1) >> 1) == count); + } + ~Barrier() { + eigen_assert((state_>>1) == 0); + } void Notify() { + unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; + if (v != 1) { + eigen_assert(((v + 2) & ~1) != 0); + return; // either count has not dropped to 0, or waiter is not waiting + } std::unique_lock l(mu_); eigen_assert(!notified_); notified_ = true; cv_.notify_all(); } - void WaitForNotification() { + void Wait() { + unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); + if ((v >> 1) == 0) return; std::unique_lock l(mu_); - cv_.wait(l, [this]() { return notified_; } ); + while (!notified_) { + cv_.wait(l); + } } private: std::mutex mu_; std::condition_variable cv_; + std::atomic state_; // low bit is waiter flag bool notified_; }; + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object, +// but only one caller must call Notify() on the object. +struct Notification : Barrier { + Notification() : Barrier(1) {}; +}; + + // Runs an arbitrary function and then calls Notify() on the passed in // Notification. -template struct FunctionWrapper +template struct FunctionWrapperWithNotification { static void run(Notification* n, Function f, Args... args) { f(args...); - n->Notify(); + if (n) { + n->Notify(); + } + } +}; + +template struct FunctionWrapperWithBarrier +{ + static void run(Barrier* b, Function f, Args... args) { + f(args...); + if (b) { + b->Notify(); + } } }; -static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) { +template +static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { if (n) { - n->WaitForNotification(); + n->Wait(); } } @@ -203,10 +238,20 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { Notification* n = new Notification(); std::function func = - std::bind(&FunctionWrapper::run, n, f, args...); + std::bind(&FunctionWrapperWithNotification::run, n, f, args...); pool_->Schedule(func); return n; } + + template + EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, + Function&& f, + Args&&... args) const { + std::function func = std::bind( + &FunctionWrapperWithBarrier::run, b, f, args...); + pool_->Schedule(func); + } + template EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... 
args) const { std::function func = std::bind(f, args...); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 54da77bcf..6bbf235cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -127,20 +127,16 @@ class TensorExecutor const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; - MaxSizeVector results(numblocks); + Barrier barrier(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(device.enqueue(&EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize)); + device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize); } if (numblocks * blocksize < size) { EvalRange::run(evaluator, numblocks * blocksize, size); } - for (int i = 0; i < numblocks; ++i) { - wait_until_ready(results[i]); - delete results[i]; - } - + barrier.Wait(); } evaluator.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index fe1dc22ee..489451215 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -256,12 +256,11 @@ struct FullReducer { const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); - MaxSizeVector results(numblocks); + Barrier barrier(numblocks); MaxSizeVector shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { - results.push_back( - device.enqueue(&FullReducerShard::run, self, - i * blocksize, blocksize, reducer, &shards[i])); + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, + i * blocksize, blocksize, reducer, &shards[i]); } typename Self::CoeffReturnType finalShard; @@ -271,10 +270,7 @@ struct FullReducer { } else { finalShard = reducer.initialize(); } - for (Index i = 0; i < numblocks; ++i) { - wait_until_ready(results[i]); - delete results[i]; - } + barrier.Wait(); for (Index i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } @@ -307,12 +303,12 @@ struct FullReducer { const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); - MaxSizeVector results(numblocks); + Barrier barrier(numblocks); MaxSizeVector shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { - results.push_back(device.enqueue(&FullReducerShard::run, - self, i * blocksize, blocksize, reducer, - &shards[i])); + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, + self, i * blocksize, blocksize, reducer, + &shards[i]); } typename Self::CoeffReturnType finalShard; if (numblocks * blocksize < num_coeffs) { @@ -322,10 +318,7 @@ struct FullReducer { finalShard = reducer.initialize(); } - for (Index i = 0; i < numblocks; ++i) { - wait_until_ready(results[i]); - delete results[i]; - } + barrier.Wait(); for (Index i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } -- cgit v1.2.3 From 3d1e857327a3ef5cfa8b65f2204c28bf405731d4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Mar 2016 15:48:28 -0700 Subject: Fixed compilation error --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 4b8eda6bb..23b1765ba 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -27,7 +27,7 @@ class ThreadPoolInterface { class ThreadPool : public ThreadPoolInterface { public: // Construct a pool that contains "num_threads" threads. - explicit ThreadPool(int num_threads) : threads_(num_threads, NULL), waiting_(num_threads, NULL) { + explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) { for (int i = 0; i < num_threads; i++) { threads_.push_back(new std::thread([this]() { WorkerLoop(); })); } -- cgit v1.2.3 From 28e02996df54240d44ead1bf827b867c22a224a9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Mar 2016 16:53:57 -0700 Subject: Merged patch 672 from Justin Lebar: Don't use long doubles with cuda --- test/main.h | 2 ++ unsupported/test/cxx11_tensor_argmax_cuda.cu | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/main.h b/test/main.h index 2797e8623..e5f1a9ad5 100644 --- a/test/main.h +++ b/test/main.h @@ -331,11 +331,13 @@ inline bool test_isApprox(const std::complex& a, const std::complex& a, const std::complex& b) { return internal::isMuchSmallerThan(a, b, test_precision >()); } +#ifndef EIGEN_TEST_NO_LONGDOUBLE inline bool test_isApprox(const std::complex& a, const std::complex& b) { return internal::isApprox(a, b, test_precision >()); } inline bool test_isMuchSmallerThan(const std::complex& a, const std::complex& b) { return internal::isMuchSmallerThan(a, b, test_precision >()); } #endif +#endif #ifndef EIGEN_TEST_NO_LONGDOUBLE inline bool test_isApprox(const long double& a, const long double& b) diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 45311d4f7..41ccbe974 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// TODO(mdevin): Free the cuda memory. 
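
The Barrier rework a few hunks above replaces one heap-allocated Notification per task with a single shared countdown: enqueue_with_barrier() registers every task against the same Barrier, and the caller blocks once in Wait() instead of looping over a results vector, waiting on and deleting each entry. A minimal, self-contained sketch of that fan-out/fan-in pattern (illustrative only: SimpleBarrier is a hypothetical simplification of the class above, and plain std::thread stands in for the Eigen thread pool):

#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

// Blocks Wait() until Notify() has been called 'count' times.
class SimpleBarrier {
 public:
  explicit SimpleBarrier(unsigned int count) : remaining_(count) {}
  void Notify() {
    std::lock_guard<std::mutex> l(mu_);
    if (--remaining_ == 0) cv_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return remaining_ == 0; });
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  unsigned int remaining_;
};

int main() {
  const unsigned int kTasks = 4;
  SimpleBarrier barrier(kTasks);      // one sync object shared by all tasks
  std::vector<std::thread> workers;
  for (unsigned int i = 0; i < kTasks; ++i)
    workers.emplace_back([&barrier] { /* evaluate one block */ barrier.Notify(); });
  barrier.Wait();                     // single wait; no per-task objects to delete
  for (std::thread& t : workers) t.join();
}
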
+#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -- cgit v1.2.3 From 9642fd7a937942037a3ea0d3c51b799be197782f Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 23 Mar 2016 15:37:45 +0100 Subject: Replace all M_PI by EIGEN_PI and add a check to the testsuite. --- test/main.h | 4 ++++ unsupported/Eigen/OpenGLSupport | 4 ++-- unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h | 8 ++------ unsupported/Eigen/src/MatrixFunctions/MatrixPower.h | 4 ++-- unsupported/test/matrix_function.cpp | 4 ++-- unsupported/test/matrix_power.cpp | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/main.h b/test/main.h index e5f1a9ad5..bba5e7570 100644 --- a/test/main.h +++ b/test/main.h @@ -58,6 +58,10 @@ #define isnan(X) please_protect_your_isnan_with_parentheses #define isinf(X) please_protect_your_isinf_with_parentheses #define isfinite(X) please_protect_your_isfinite_with_parentheses +#ifdef M_PI +#undef M_PI +#endif +#define M_PI please_use_EIGEN_PI_instead_of_M_PI #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport index 288c6b0fb..87f50947d 100644 --- a/unsupported/Eigen/OpenGLSupport +++ b/unsupported/Eigen/OpenGLSupport @@ -180,11 +180,11 @@ template void glLoadMatrix(const Transform& rot) { - glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f); + glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f); } inline void glRotate(const Rotation2D& rot) { - glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0); + glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0); } template void glRotate(const RotationBase& rot) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index 463d7be0c..e43e86e90 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -11,10 +11,6 @@ #ifndef EIGEN_MATRIX_LOGARITHM #define EIGEN_MATRIX_LOGARITHM -#ifndef M_PI -#define M_PI 3.141592653589793238462643383279503L -#endif - namespace Eigen { namespace internal { @@ -65,8 +61,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) else { // computation in previous branch is inaccurate if A(1,1) \approx A(0,0) - int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI))); - result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y; + int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI))); + result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y; } } diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index 1e5a59c55..f37d31c3f 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -298,8 +298,8 @@ MatrixPowerAtomic::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar logCurr = log(curr); ComplexScalar logPrev = log(prev); - int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI)); - ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber); + int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI)); + ComplexScalar w = 
numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber); return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev); } diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 487d5a9b8..9a995f941 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A) MatrixType scaledA; RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff(); - if (maxImagPartOfSpectrum >= 0.9 * M_PI) - scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum; + if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI) + scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum; else scaledA = A; diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index baf183d12..8e104ed1e 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -24,7 +24,7 @@ void test2dRotation(double tol) s = std::sin(angle); B << c, s, -s, c; - C = Apow(std::ldexp(angle,1) / M_PI); + C = Apow(std::ldexp(angle,1) / EIGEN_PI); std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n'; VERIFY(C.isApprox(B, tol)); } -- cgit v1.2.3 From 6971146ca9e4b5870404974397a81d125b2418d4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 09:44:52 -0700 Subject: Added more conversion operators for half floats --- Eigen/src/Core/arch/CUDA/Half.h | 55 +++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 921c5bcb2..f997735aa 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -65,20 +65,61 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h); struct half : public __half { EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {} - // TODO(sesse): Should these conversions be marked as explicit? - EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {} - EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast(i))) {} - EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(static_cast(d))) {} - EIGEN_DEVICE_FUNC half(bool b) - : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {} EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {} EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {} + explicit EIGEN_DEVICE_FUNC half(bool b) + : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + explicit EIGEN_DEVICE_FUNC half(int i) + : __half(internal::float_to_half_rtne(static_cast(i))) {} + explicit EIGEN_DEVICE_FUNC half(long l) + : __half(internal::float_to_half_rtne(static_cast(l))) {} + explicit EIGEN_DEVICE_FUNC half(long long ll) + : __half(internal::float_to_half_rtne(static_cast(ll))) {} + explicit EIGEN_DEVICE_FUNC half(float f) + : __half(internal::float_to_half_rtne(f)) {} + explicit EIGEN_DEVICE_FUNC half(double d) + : __half(internal::float_to_half_rtne(static_cast(d))) {} + + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { + // +0.0 and -0.0 become false, everything else becomes true. 
+ return static_cast(x & 0x7fff); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { + return static_cast(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { + return static_cast(internal::half_to_float(*this)); + } EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { return internal::half_to_float(*this); } EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { - return internal::half_to_float(*this); + return static_cast(internal::half_to_float(*this)); } EIGEN_DEVICE_FUNC half& operator=(const half& other) { -- cgit v1.2.3 From 0e6888260459b31dac1bd3411b0e8f688f6d22a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 09:46:42 -0700 Subject: Added the ability to divide a half float by an index --- Eigen/src/Core/arch/CUDA/Half.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index f997735aa..08f6005e4 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -234,6 +234,12 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { #endif // Emulate support for half floats +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { + return Eigen::half(static_cast(a) / static_cast(b)); +} + // Conversion routines, including fallbacks for the host or older CUDA. // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of // these in hardware. 
If we need more performance on older/other CPUs, they are -- cgit v1.2.3 From fc3660285fe326744eb67711126d2764a1f97100 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 09:56:50 -0700 Subject: Made type conversion explicit --- Eigen/src/Core/arch/CUDA/TypeCasting.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index 4c0433267..b2a9724de 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -114,8 +114,8 @@ template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float22half2_rn(make_float2(a.x, a.y)); #else - half r1 = a.x; - half r2 = a.y; + half r1 = static_cast(a.x); + half r2 = static_cast(a.y); half2 r; r.x = 0; r.x |= r1.x; -- cgit v1.2.3 From 2062ee2d269eac5ff78f70ac3133d0a47c22d9fa Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 13:39:00 -0700 Subject: Added a test to verify that notifications are working properly --- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_notification.cpp | 72 ++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 unsupported/test/cxx11_tensor_notification.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 20048515c..6bd8cfb92 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -149,6 +149,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_argmax) ei_add_test(cxx11_tensor_shuffling) ei_add_test(cxx11_tensor_striding) + ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_ref) ei_add_test(cxx11_tensor_random) diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp new file mode 100644 index 000000000..961d4edf6 --- /dev/null +++ b/unsupported/test/cxx11_tensor_notification.cpp @@ -0,0 +1,72 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Vijay Vasudevan +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + +#include +#include +#include "main.h" +#include + +namespace { + +void WaitAndAdd(Eigen::Notification* n, int* counter) { + n->Wait(); + *counter = *counter + 1; +} + +} // namespace + +static void test_notification_single() +{ + ThreadPool thread_pool(1); + + int counter = 0; + Eigen::Notification n; + std::function func = std::bind(&WaitAndAdd, &n, &counter); + thread_pool.Schedule(func); + sleep(1); + + // The thread should be waiting for the notification. + VERIFY_IS_EQUAL(counter, 0); + + // Unblock the thread + n.Notify(); + + sleep(1); + + // Verify the counter has been incremented + VERIFY_IS_EQUAL(counter, 1); +} + +// Like test_notification_single() but enqueues multiple threads to +// validate that all threads get notified by Notify(). 
+static void test_notification_multiple() +{ + ThreadPool thread_pool(1); + + int counter = 0; + Eigen::Notification n; + std::function func = std::bind(&WaitAndAdd, &n, &counter); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + sleep(1); + VERIFY_IS_EQUAL(counter, 0); + n.Notify(); + sleep(1); + VERIFY_IS_EQUAL(counter, 4); +} + +void test_cxx11_tensor_notification() +{ + CALL_SUBTEST(test_notification_single()); + CALL_SUBTEST(test_notification_multiple()); +} -- cgit v1.2.3 From 7168afde5e9c3b05823b939a499c6752d2db10f7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 14:21:04 -0700 Subject: Made the tensor benchmarks compile on MacOS --- bench/tensors/tensor_benchmarks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index d916f787e..a4f97728d 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -333,7 +333,7 @@ template class BenchmarkSuite { #ifndef EIGEN_HAS_INDEX_LIST Eigen::array sum_along_dim; - sum_along_dim = 1; + sum_along_dim[0] = 1; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -356,7 +356,7 @@ template class BenchmarkSuite { input_size[1] = n_; const TensorMap, Eigen::Aligned> B( b_, input_size); - const Eigen::array output_size; + Eigen::array output_size; TensorMap, Eigen::Aligned> C( c_, output_size); -- cgit v1.2.3 From 7a570e50ef0a79d52d5762d086954564afce9d61 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 16:00:06 -0700 Subject: Fixed contractions of fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 08f6005e4..61131828f 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -63,7 +63,7 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h); // Class definition. 
struct half : public __half { - EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {} + EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {} EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {} diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 29b5637e7..cb917bb37 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -134,7 +134,7 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_float); } -/* + void test_cuda_contractions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -181,7 +181,7 @@ void test_cuda_contractions() { gpu_device.deallocate(d_float2); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); -}*/ +} void test_cuda_reductions() { @@ -244,7 +244,7 @@ void test_cxx11_tensor_of_float16_cuda() CALL_SUBTEST_1(test_cuda_conversion()); CALL_SUBTEST_1(test_cuda_unary()); CALL_SUBTEST_1(test_cuda_elementwise()); -// CALL_SUBTEST_2(test_cuda_contractions()); + CALL_SUBTEST_2(test_cuda_contractions()); CALL_SUBTEST_3(test_cuda_reductions()); } else { -- cgit v1.2.3 From bff8cbad068a74f1a1f7aa0e80e4424c6353c9fb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 16:14:23 -0700 Subject: Removed executable bit from header files --- Eigen/src/Core/AssignEvaluator.h | 0 Eigen/src/Core/Assign_MKL.h | 0 Eigen/src/Core/ProductEvaluators.h | 0 Eigen/src/Core/VectorwiseOp.h | 0 Eigen/src/Core/products/GeneralMatrixVector_MKL.h | 0 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h | 0 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h | 0 7 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 Eigen/src/Core/AssignEvaluator.h mode change 100755 => 100644 Eigen/src/Core/Assign_MKL.h mode change 100755 => 100644 Eigen/src/Core/ProductEvaluators.h mode change 100755 => 100644 Eigen/src/Core/VectorwiseOp.h mode change 100755 => 100644 Eigen/src/Core/products/GeneralMatrixVector_MKL.h mode change 100755 => 100644 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h mode change 100755 => 100644 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h old mode 100755 new mode 100644 -- cgit v1.2.3 From 81d340984ae40642eed46cbfb3a817d841d85de1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 16:15:02 -0700 Subject: Removed executable bit from header files --- Eigen/src/Eigenvalues/ComplexSchur_MKL.h | 0 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 0 Eigen/src/Eigenvalues/RealQZ.h | 0 Eigen/src/Eigenvalues/RealSchur_MKL.h | 0 
Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h | 0 Eigen/src/PardisoSupport/PardisoSupport.h | 0 Eigen/src/QR/ColPivHouseholderQR_MKL.h | 0 Eigen/src/SVD/JacobiSVD.h | 0 Eigen/src/SparseLU/SparseLU.h | 0 9 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 Eigen/src/Eigenvalues/ComplexSchur_MKL.h mode change 100755 => 100644 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h mode change 100755 => 100644 Eigen/src/Eigenvalues/RealQZ.h mode change 100755 => 100644 Eigen/src/Eigenvalues/RealSchur_MKL.h mode change 100755 => 100644 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h mode change 100755 => 100644 Eigen/src/PardisoSupport/PardisoSupport.h mode change 100755 => 100644 Eigen/src/QR/ColPivHouseholderQR_MKL.h mode change 100755 => 100644 Eigen/src/SVD/JacobiSVD.h mode change 100755 => 100644 Eigen/src/SparseLU/SparseLU.h diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h old mode 100755 new mode 100644 diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h old mode 100755 new mode 100644 diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h old mode 100755 new mode 100644 -- cgit v1.2.3 From 393bc3b16b413598b6c9dcbae722aafb5672d457 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 16:22:15 -0700 Subject: Added comment --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index efe688e50..579519b04 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -13,7 +13,7 @@ // The array class is only available starting with cxx11. Emulate our own here -// if needed. +// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! // Moreover, CUDA doesn't support the STL containers, so we use our own instead. 
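
The guard that follows picks between std::array and a hand-rolled emulation because neither nvcc nor pre-2015 MSVC could be trusted with the STL container. A minimal sketch of the emulation idea (illustrative only, assuming just indexing and a size query are needed; the real EmulateArray.h implements far more of the interface):

#include <cstddef>

template <typename T, std::size_t N>
struct array_sketch {
  T values[N];  // plain aggregate member: no STL dependency, usable in device code

  T& operator[](std::size_t i) { return values[i]; }
  const T& operator[](std::size_t i) const { return values[i]; }
  static std::size_t size() { return N; }
};

int main() {
  array_sketch<int, 3> a = {{1, 2, 3}};
  a[1] = 5;                        // element access, just like std::array
  return a.size() == 3 ? 0 : 1;    // size is a compile-time constant
}
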
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) -- cgit v1.2.3 From 9bc9396e88789e86647227353e10d90d5316fa98 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 16:30:06 -0700 Subject: Use portable includes --- unsupported/test/cxx11_tensor_notification.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp index 961d4edf6..813bc4413 100644 --- a/unsupported/test/cxx11_tensor_notification.cpp +++ b/unsupported/test/cxx11_tensor_notification.cpp @@ -10,7 +10,7 @@ #define EIGEN_USE_THREADS #include -#include +#include #include "main.h" #include -- cgit v1.2.3 From 92693b50eb09cdaeecb830a06df6d6c67a369477 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 16:40:36 -0700 Subject: Fixed compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 6bbf235cc..f71625ae5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -125,7 +125,7 @@ class TensorExecutor int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; + const unsigned int numblocks = static_cast(size / blocksize); Barrier barrier(numblocks); for (int i = 0; i < numblocks; ++i) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 489451215..0ce2517d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -253,7 +253,7 @@ struct FullReducer { return; } else { const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); - const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0; eigen_assert(num_coeffs >= numblocks * blocksize); Barrier barrier(numblocks); @@ -300,7 +300,7 @@ struct FullReducer { return; } const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); - const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + const unsigned int numblocks = blocksize > 0 ? 
static_cast(num_coeffs / blocksize) : 0; eigen_assert(num_coeffs >= numblocks * blocksize); Barrier barrier(numblocks); -- cgit v1.2.3 From 41434a8a852e3f2744164bece0487d7e767717b6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 16:52:38 -0700 Subject: Avoid unnecessary conversions --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f71625ae5..3408933bf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -128,7 +128,7 @@ class TensorExecutor const unsigned int numblocks = static_cast(size / blocksize); Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { + for (unsigned int i = 0; i < numblocks; ++i) { device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 0ce2517d6..9875601ba 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -258,7 +258,7 @@ struct FullReducer { Barrier barrier(numblocks); MaxSizeVector shards(numblocks, reducer.initialize()); - for (Index i = 0; i < numblocks; ++i) { + for (unsigned int i = 0; i < numblocks; ++i) { device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, i * blocksize, blocksize, reducer, &shards[i]); } @@ -271,7 +271,7 @@ struct FullReducer { finalShard = reducer.initialize(); } barrier.Wait(); - for (Index i = 0; i < numblocks; ++i) { + for (unsigned int i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } *output = reducer.finalize(finalShard); @@ -305,7 +305,7 @@ struct FullReducer { Barrier barrier(numblocks); MaxSizeVector shards(numblocks, reducer.initialize()); - for (Index i = 0; i < numblocks; ++i) { + for (unsigned int i = 0; i < numblocks; ++i) { device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, i * blocksize, blocksize, reducer, &shards[i]); @@ -319,7 +319,7 @@ struct FullReducer { } barrier.Wait(); - for (Index i = 0; i < numblocks; ++i) { + for (unsigned int i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } *output = reducer.finalize(finalShard); -- cgit v1.2.3 From 2e4e4cb74de19de0997567a4d058d1522ec7e452 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 16:57:12 -0700 Subject: Use numext::abs instead of abs to avoid incorrect conversion to integer of the argument --- Eigen/src/Core/SpecialFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index c12e41a7b..37ebb5915 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -576,7 +576,7 @@ struct igammac_impl { pkm1 = pk; qkm2 = qkm1; qkm1 = qk; - if (abs(pk) > big) { + if (numext::abs(pk) > big) { pkm2 *= biginv; pkm1 *= biginv; qkm2 *= biginv; -- cgit v1.2.3 From 044efea965b484fcf13551c8edabdb62c4b4b462 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Mar 2016 20:02:11 -0700 Subject: Made sure that the cxx11_tensor_cuda test can be compiled even without support for cxx11. 
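
The guard this commit adds keys off __cplusplus, which pre-C++11 compilers report as 199711L. A stand-alone illustration of the same pattern (a hedged sketch, not the test itself; std::erf has only been in <cmath> since C++11):

#include <cmath>
#include <cstdio>

int main() {
#if __cplusplus > 199711L
  // C++11 provides std::erf, so it can serve as a golden reference.
  std::printf("erf(1.0) = %f\n", std::erf(1.0));
#else
  std::printf("pre-C++11 compiler: skipping erf reference check\n");
#endif
  return 0;
}
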
---
 unsupported/test/cxx11_tensor_cuda.cu | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 1964d9e07..4d8465756 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -853,6 +853,10 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
   CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
 
+#if __cplusplus > 199711L
+  // std::erf, std::erfc, and so on were only added in c++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a c++11 compiler.
   CALL_SUBTEST_4(test_cuda_lgamma(1.0f));
   CALL_SUBTEST_4(test_cuda_lgamma(100.0f));
   CALL_SUBTEST_4(test_cuda_lgamma(0.01f));
@@ -860,6 +864,7 @@ void test_cxx11_tensor_cuda()
 
   CALL_SUBTEST_4(test_cuda_digamma());
 
+  CALL_SUBTEST_4(test_cuda_erf(1.0f));
   CALL_SUBTEST_4(test_cuda_erf(100.0f));
   CALL_SUBTEST_4(test_cuda_erf(0.01f));
@@ -894,4 +899,5 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_5(test_cuda_igamma());
   CALL_SUBTEST_5(test_cuda_igammac());
+#endif
 }
-- cgit v1.2.3

From 0968e925a040d4988f02e8476b5cea8518e5f966 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 24 Mar 2016 18:00:33 -0700
Subject: Updated the benchmarking code to use Eigen::half instead of half

---
 bench/tensors/tensor_benchmarks_fp16_gpu.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 49f75472a..35c6f7489 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -12,7 +12,7 @@
 StopBenchmarkTiming(); \
 Eigen::CudaStreamDevice stream; \
 Eigen::GpuDevice device(&stream); \
- BenchmarkSuite<Eigen::GpuDevice, half> suite(device, N); \
+ BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
 cudaDeviceSynchronize(); \
 suite.FUNC(iters); \
 } \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
 StopBenchmarkTiming(); \
 Eigen::CudaStreamDevice stream; \
 Eigen::GpuDevice device(&stream); \
- BenchmarkSuite<Eigen::GpuDevice, half> suite(device, D1, D2, D3); \
+ BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
 cudaDeviceSynchronize(); \
 suite.FUNC(iters); \
 } \
@@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64);
 StopBenchmarkTiming(); \
 Eigen::CudaStreamDevice stream; \
 Eigen::GpuDevice device(&stream); \
- BenchmarkSuite<Eigen::GpuDevice, half> suite(device, N); \
+ BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
 cudaDeviceSynchronize(); \
 suite.FUNC(iters, DIM1, DIM2); \
 } \
@@ -73,4 +73,4 @@ BM_FuncWithKernelDimsGPU(convolution, 7, 4);
 BM_FuncWithKernelDimsGPU(convolution, 4, 7);
 BM_FuncWithKernelDimsGPU(convolution, 7, 64);
 BM_FuncWithKernelDimsGPU(convolution, 64, 7);
-*/
\ No newline at end of file
+*/
-- cgit v1.2.3

From a86c9f037b24312863ad2a74a583369581c6e21a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 24 Mar 2016 18:54:31 -0700
Subject: Fixed compilation error on windows

---
 unsupported/test/cxx11_tensor_notification.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index 813bc4413..c946007b8 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -9,11 +9,20 @@
 
 #define EIGEN_USE_THREADS
 
-#include
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#include <windows.h>
+void sleep(int seconds) {
+  Sleep(seconds*1000);
+}
+#else
+#include <unistd.h>
+#endif
+
+
 namespace {
 
 void
WaitAndAdd(Eigen::Notification* n, int* counter) { -- cgit v1.2.3 From d94f6ba9659f8c953caaff854552070ce149958b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 25 Mar 2016 11:02:56 -0700 Subject: Started to model the cost of divisions more accurately. --- Eigen/src/Core/NumTraits.h | 17 +++++++++++++++++ Eigen/src/Core/functors/BinaryFunctors.h | 12 +++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 7ddb4a867..b7b5e7d22 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -60,6 +60,23 @@ template struct GenericNumTraits MulCost = 1 }; + // Division is messy but important, because it is expensive and throughput + // varies significantly. The following numbers are based on min division + // throughput on Haswell. + template + struct Div { + enum { +#ifdef EIGEN_VECTORIZE_AVX + AVX = true, +#else + AVX = false, +#endif + Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)): + Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8 + }; + }; + + typedef T Real; typedef typename internal::conditional< IsInteger, diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 5cdfff845..d04323bb0 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -238,7 +238,13 @@ template struct scalar_hypot_op { }; template struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess=0 }; + enum + { + Cost = 3 * NumTraits::AddCost + + 2 * NumTraits::MulCost + + 2 * NumTraits::template Div::Cost, + PacketAccess = false + }; }; /** \internal @@ -564,6 +570,10 @@ struct scalar_inverse_mult_op { { return internal::pdiv(pset1(m_other),a); } Scalar m_other; }; +template +struct functor_traits > +{ enum { PacketAccess = packet_traits::HasDiv, Cost = NumTraits::template Div::Cost }; }; + } // end namespace internal -- cgit v1.2.3 From 65716e99a5763f536257eb1dd047f34f8172f816 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 25 Mar 2016 11:13:53 -0700 Subject: Improved the cost estimate of the quotient op --- Eigen/src/Core/functors/BinaryFunctors.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index d04323bb0..e28fecfd0 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -303,9 +303,10 @@ template struct scalar_quotient_op { }; template struct functor_traits > { + typedef typename scalar_quotient_op::result_type result_type; enum { - Cost = (NumTraits::MulCost + NumTraits::MulCost), // rough estimate! 
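
Reading the Div<Vectorized> table just introduced, the constants work out as follows (a worked example of the enum above, nothing more): float division costs 8 when scalar, 7 when SSE-vectorized, and 14 when AVX-vectorized; the AVX figure for double is 16; 64-bit signed integer division is the most expensive at 24. Plugging the scalar figure into the hypot functor gives 3*AddCost + 2*MulCost + 2*Div<false>::Cost = 3 + 2 + 16 = 21 for float, and the surrounding hunk swaps the quotient op's "rough estimate" for the same table.
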
- PacketAccess = scalar_quotient_op::Vectorizable + PacketAccess = scalar_quotient_op::Vectorizable, + Cost = NumTraits::template Div::Cost }; }; -- cgit v1.2.3 From 74f91ed06c615fc7d875bd30cb72ea5e08504be2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 25 Mar 2016 17:21:56 -0700 Subject: Improved support for integer modulo --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index c71a30d21..eb0c8d1ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -25,7 +25,20 @@ struct scalar_mod_op { }; template struct functor_traits > -{ enum { Cost = 2 * NumTraits::MulCost, PacketAccess = false }; }; +{ enum { Cost = NumTraits::template Div::Cost, PacketAccess = false }; }; + + +/** \internal + * \brief Template functor to compute the modulo between 2 arrays. + */ +template +struct scalar_mod2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::template Div::Cost, PacketAccess = false }; }; /** \internal -- cgit v1.2.3 From ed6b9d08f1ac5b32a1097484a7a7e4672648ff12 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Sun, 27 Mar 2016 18:47:49 -0400 Subject: some primitives ported, but missing intrinsics and crash with asm() are a problem --- Eigen/Core | 10 ++++++++++ test/packetmath.cpp | 32 +++++++++++++++++--------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 8428c51e4..cc4ac5843 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -194,6 +194,10 @@ #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_NEON #include + #elif (defined __s390x__ && defined __VEC__) + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ZVECTOR + #include #endif #endif @@ -267,6 +271,8 @@ inline static const char *SimdInstructionSetsInUse(void) { return "VSX"; #elif defined(EIGEN_VECTORIZE_NEON) return "ARM NEON"; +#elif defined(EIGEN_VECTORIZE_ZVECTOR) + return "S390X ZVECTOR"; #else return "None"; #endif @@ -329,6 +335,10 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/PacketMath.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" +#elif defined EIGEN_VECTORIZE_ZVECTOR + #include "src/Core/arch/ZVector/PacketMath.h" +// #include "src/Core/arch/ZVector/MathFunctions.h" +// #include "src/Core/arch/ZVector/Complex.h" #endif #include "src/Core/arch/CUDA/Half.h" diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9e89f85c1..9d49ec4f2 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -170,14 +170,14 @@ template void packetmath() CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); - for(int offset=0;offset<3;++offset) +/* for(int offset=0;offset<3;++offset) { for (int i=0; i(data1[offset])); VERIFY(areApprox(ref, data2, PacketSize) && "internal::pset1"); - } - + }*/ +/* { for (int i=0; i void packetmath() internal::pstore(data2+1*PacketSize, A1); VERIFY(areApprox(ref, data2, 2*PacketSize) && "internal::pbroadcast2"); } - + */ VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload(data1))) && "internal::pfirst"); - + if(PacketSize>1) { for(int offset=0;offset<4;++offset) @@ -212,6 +212,7 @@ template void packetmath() 
VERIFY(areApprox(ref, data2, PacketSize) && "ploaddup"); } } + if(PacketSize>2) { for(int offset=0;offset<4;++offset) @@ -222,7 +223,7 @@ template void packetmath() VERIFY(areApprox(ref, data2, PacketSize) && "ploadquad"); } } - +/* ref[0] = 0; for (int i=0; i void packetmath() for (int i = 0; i < PacketSize; ++i) { VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue)); } - } + }*/ } - +/* template void packetmath_real() { using std::abs; @@ -431,6 +432,7 @@ template void packetmath_real() VERIFY((numext::isnan)(data2[0])); VERIFY((numext::isnan)(data2[1])); #endif + } } @@ -528,7 +530,7 @@ template void packetmath_complex() internal::pstore(pval,internal::pcplxflip(internal::pload(data1))); VERIFY(areApprox(ref, pval, PacketSize) && "pcplxflip"); } -} +}*/ template void packetmath_scatter_gather() { @@ -573,9 +575,9 @@ void test_packetmath() CALL_SUBTEST_1( packetmath() ); CALL_SUBTEST_2( packetmath() ); CALL_SUBTEST_3( packetmath() ); - CALL_SUBTEST_4( packetmath >() ); - CALL_SUBTEST_5( packetmath >() ); - +/* CALL_SUBTEST_4( packetmath >() ); + CALL_SUBTEST_5( packetmath >() );*/ +/* CALL_SUBTEST_1( packetmath_notcomplex() ); CALL_SUBTEST_2( packetmath_notcomplex() ); CALL_SUBTEST_3( packetmath_notcomplex() ); @@ -584,12 +586,12 @@ void test_packetmath() CALL_SUBTEST_2( packetmath_real() ); CALL_SUBTEST_4( packetmath_complex >() ); - CALL_SUBTEST_5( packetmath_complex >() ); + CALL_SUBTEST_5( packetmath_complex >() );*/ CALL_SUBTEST_1( packetmath_scatter_gather() ); CALL_SUBTEST_2( packetmath_scatter_gather() ); CALL_SUBTEST_3( packetmath_scatter_gather() ); - CALL_SUBTEST_4( packetmath_scatter_gather >() ); - CALL_SUBTEST_5( packetmath_scatter_gather >() ); +/* CALL_SUBTEST_4( packetmath_scatter_gather >() ); + CALL_SUBTEST_5( packetmath_scatter_gather >() );*/ } } -- cgit v1.2.3 From 01e7298fe605b24ad71594f31df6df5c23b90fee Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 28 Mar 2016 10:58:02 -0400 Subject: actually include ZVector files, passes most basic tests (float still fails) --- Eigen/src/Core/arch/ZVector/CMakeLists.txt | 6 + Eigen/src/Core/arch/ZVector/PacketMath.h | 935 +++++++++++++++++++++++++++++ test/packetmath.cpp | 26 +- 3 files changed, 954 insertions(+), 13 deletions(-) create mode 100644 Eigen/src/Core/arch/ZVector/CMakeLists.txt create mode 100755 Eigen/src/Core/arch/ZVector/PacketMath.h diff --git a/Eigen/src/Core/arch/ZVector/CMakeLists.txt b/Eigen/src/Core/arch/ZVector/CMakeLists.txt new file mode 100644 index 000000000..5eb0957eb --- /dev/null +++ b/Eigen/src/Core/arch/ZVector/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_ZVector_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_ZVector_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/ZVector COMPONENT Devel +) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h new file mode 100755 index 000000000..c786aeec0 --- /dev/null +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -0,0 +1,935 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Konstantinos Margaritis +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
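
Before the ZVector specializations that follow, it is worth spelling out the contract the packetmath tests exercise: load a packet's worth of scalars, apply the operation lane by lane, store, and compare against a scalar reference. A port-agnostic sketch of that round trip in plain C++ (no s390x intrinsics; PacketSketch and the *_sketch helpers are illustrative names, not Eigen's):

#include <cassert>
#include <cstddef>

const std::size_t kLanes = 4;  // Packet4f-style width

struct PacketSketch { float lane[kLanes]; };

PacketSketch pload_sketch(const float* from) {
  PacketSketch p;
  for (std::size_t i = 0; i < kLanes; ++i) p.lane[i] = from[i];  // a vector load stands in here
  return p;
}

PacketSketch padd_sketch(PacketSketch a, PacketSketch b) {
  for (std::size_t i = 0; i < kLanes; ++i) a.lane[i] += b.lane[i];  // lane-wise add
  return a;
}

void pstore_sketch(float* to, const PacketSketch& p) {
  for (std::size_t i = 0; i < kLanes; ++i) to[i] = p.lane[i];
}

int main() {
  float x[kLanes] = {1, 2, 3, 4}, y[kLanes] = {10, 20, 30, 40}, out[kLanes];
  pstore_sketch(out, padd_sketch(pload_sketch(x), pload_sketch(y)));
  for (std::size_t i = 0; i < kLanes; ++i)
    assert(out[i] == x[i] + y[i]);  // the scalar reference check packetmath performs
  return 0;
}
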
+ +#ifndef EIGEN_PACKET_MATH_ZVECTOR_H +#define EIGEN_PACKET_MATH_ZVECTOR_H + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif + +// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#endif + +typedef __vector float Packet4f; +typedef __vector int Packet4i; +typedef __vector unsigned int Packet4ui; +typedef __vector __bool int Packet4bi; +typedef __vector short int Packet8i; +typedef __vector unsigned char Packet16uc; +typedef __vector double Packet2d; +typedef __vector unsigned long long Packet2ul; +typedef __vector long long Packet2l; + +typedef union { + float f[4]; + double d[2]; + int i[4]; + Packet4f v4f; + Packet4i v4i; + Packet2d v2d; +} Packet; + +// We don't want to write the same code all the time, but we need to reuse the constants +// and it doesn't really work to declare them global, so we define macros instead + +#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + +#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ + Packet4i p4i_##NAME = reinterpret_cast(vec_splat_s32(X)) + +#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = reinterpret_cast(vec_splat_s64(X)) + +#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = reinterpret_cast(vec_splat_s64(X)) + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + Packet4i p4i_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) + +// These constants are endian-agnostic +static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} + +static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); + +static Packet4f p4f_ONE = { 1.0, 1.0, 1.0, 1.0 }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; + +/* +static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} +*/ +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); + +static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; + +// Mask alignment +#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 + +#define _EIGEN_ALIGNED_PTR(x) 
((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) + +// Handle endianness properly while loading constants +// Define global static constants: +/* +static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);*/ +static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +/* +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; +static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; +static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; + +static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/ +static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; + +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; + +//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; + + +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR); +#else + #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( " pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#endif + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + + // FIXME check the Has* + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 0 + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4i type; + typedef Packet4i half; + enum { + // FIXME check the Has* + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + + // FIXME check the Has* + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 0 + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 1, + + // FIXME check the Has* + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; 
typedef Packet4i half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; + +inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) +{ + union { + Packet16uc v; + unsigned char n[16]; + } vt; + vt.v = v; + for (int i=0; i< 16; i++) + s << (int)vt.n[i] << ", "; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) +{ + union { + Packet4f v; + float n[4]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) +{ + union { + Packet4i v; + int n[4]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) +{ + union { + Packet4ui v; + unsigned int n[4]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) +{ + union { + Packet2d v; + double n[2]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1]; + return s; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 4)); break; + case 2: + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); break; + case 3: + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 12)); break; + } + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); + } +}; + +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4f; +} + +template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4i; +} + +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v2d; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4f = from; +} + +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4i = from; +} + +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v2d = from; +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const 
float& from) +{ + // FIXME: Check if proper intrinsic exists + Packet res; + res.f[0] = from; + res.v4f = reinterpret_cast(vec_splats(res.i[0])); + return res.v4f; +} + +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) +{ + return vec_splats(from); +} + +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + Packet2d res; + res = vec_splats(from); + return res; +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = reinterpret_cast(vec_splat(reinterpret_cast(a3), 0)); + a1 = reinterpret_cast(vec_splat(reinterpret_cast(a3), 1)); + a2 = reinterpret_cast(vec_splat(reinterpret_cast(a3), 2)); + a3 = reinterpret_cast(vec_splat(reinterpret_cast(a3), 3)); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const int *a, + Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload(a); + a0 = vec_splat(a1, 0); + a1 = vec_splat(a1, 1); + a3 = pload(a+2); + a2 = vec_splat(a3, 0); + a3 = vec_splat(a3, 1); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return pload(af); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + pstore(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + pstore((int *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + pstore(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +/* +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub(p4f_ZERO, a); } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub(p4i_ZERO, a); } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub(p2d_ZERO, a); } +*/ +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2d pconj(const 
Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ + return a; +} + +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) +{ + return reinterpret_cast(__builtin_s390_vmalf(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ + return vec_madd(a, b, c); +} + +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) +{ + return pmadd(a,b,p4f_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) +{ + return pmadd(a,b,p4i_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) +{ + return pmadd(a,b,p2d_ZERO); +} + +/*template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ +#ifndef __VSX__ // VSX actually provides a div instruction + Packet4f t, y_0, y_1; + + // Altivec does not offer a divide instruction, we have to do a reciprocal approximation + y_0 = vec_re(b); + + // Do one Newton-Raphson iteration to get the needed accuracy + t = vec_nmsub(y_0, b, p4f_ONE); + y_1 = vec_madd(y_0, t, y_0); + + return vec_madd(a, y_1, p4f_ZERO); +#else + return vec_div(a, b); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) +{ eigen_assert(false && "packet integer division are not supported by AltiVec"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } +*/ + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return pmadd(a, p4f_ONE, b); } +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return pmadd(a, p4i_ONE, b); } +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return pmadd(a, p2d_ONE, b); } + +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } + + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return a; /*vec_min(a, b);*/ } +template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return a; /*vec_max(a, b);*/ } +template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } + + +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) +{ + return reinterpret_cast(vec_and(reinterpret_cast(a), reinterpret_cast(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) +{ + return vec_and(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) +{ + return vec_and(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) +{ + return 
reinterpret_cast(vec_or(reinterpret_cast(a), reinterpret_cast(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) +{ + return vec_or(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) +{ + return vec_or(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) +{ + return reinterpret_cast(vec_xor(reinterpret_cast(a), reinterpret_cast(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) +{ + return vec_xor(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) +{ + return vec_and(a, vec_nor(b, b)); +} + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) +{ + return vec_xor(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ + return pand(a, reinterpret_cast(vec_nor(reinterpret_cast(b), reinterpret_cast(b)))); +} + +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) +{ + return pand(a, vec_nor(b, b)); +} + +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4f; +} + +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4i; +} + +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v2d; +} + +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p; + if((ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return (Packet4f) vec_perm((Packet16uc)(p), (Packet16uc)(p), p16uc_DUPLICATE32_HI); +} + +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +{ + Packet4i p; + if((ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} + +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) +{ + Packet2d p; + if((ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_PSET64_HI); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4f = from; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4i = from; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v2d = from; +} + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) +{ + EIGEN_ZVECTOR_PREFETCH(addr); +} + +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) +{ + EIGEN_ZVECTOR_PREFETCH(addr); +} + +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) +{ + EIGEN_ZVECTOR_PREFETCH(addr); +} + +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE 
Packet4f preverse(const Packet4f& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} + +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); +} + +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return a; /*vec_abs(a);*/ } +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } + +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet4f b, sum; + b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); + sum = padd(a, b); + b = reinterpret_cast(vec_sld(reinterpret_cast(sum), reinterpret_cast(sum), 4)); + sum = padd(sum, b); + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +{ + Packet4i b, sum; + b = vec_sld(a, a, 8); + sum = padd(a, b); + b = vec_sld(sum, sum, 4); + sum = padd(sum, b); + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ + Packet2d b, sum; + b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); + sum = padd(a, b); + return pfirst(sum); +} + + +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + Packet4f v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[2]))); + v[1] = reinterpret_cast(vec_mergeh(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[2]))); + v[2] = reinterpret_cast(vec_mergeh(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[3]))); + v[3] = reinterpret_cast(vec_mergeh(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[3]))); + // Get the resulting vectors + sum[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(v[0]), reinterpret_cast(v[2]))); + sum[1] = reinterpret_cast(vec_mergeh(reinterpret_cast(v[0]), reinterpret_cast(v[2]))); + sum[2] = reinterpret_cast(vec_mergeh(reinterpret_cast(v[1]), reinterpret_cast(v[3]))); + sum[3] = reinterpret_cast(vec_mergeh(reinterpret_cast(v[1]), reinterpret_cast(v[3]))); + + // Now do the summation: + // Lines 0+1 + sum[0] = padd(sum[0], sum[1]); + // Lines 2+3 + sum[1] = padd(sum[2], sum[3]); + // Add the results + sum[0] = padd(sum[0], sum[1]); + + return sum[0]; +} + +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +{ + Packet4i v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + + // Now do the summation: + // Lines 0+1 + sum[0] = padd(sum[0], sum[1]); + // Lines 2+3 + sum[1] = padd(sum[2], sum[3]); + // Add the results + sum[0] = padd(sum[0], sum[1]); + + 
return sum[0]; +} + +template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +{ + Packet2d v[2], sum; + v[0] = padd(vecs[0], reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8))); + v[1] = padd(vecs[1], reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8))); + + sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); + + return sum; +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + Packet4f prod; + prod = pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8))); + return pfirst(pmul(prod, reinterpret_cast(vec_sld(reinterpret_cast(prod), reinterpret_cast(prod), 4)))); +} + +template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +{ + EIGEN_ALIGN16 int aux[4]; + pstore(aux, a); + return aux[0] * aux[1] * aux[2] * aux[3]; +} + +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ + return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} + +// min +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet4f b, res; + b = pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8))); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 4))); + return pfirst(res); +} + +template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +{ + Packet4i b, res; + b = pmin(a, vec_sld(a, a, 8)); + res = pmin(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) +{ + return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} + +// max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet4f b, res; + b = pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8))); + res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 4))); + return pfirst(res); +} + +template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +{ + Packet4i b, res; + b = pmax(a, vec_sld(a, a, 8)); + res = pmax(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +// max +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ + return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f t0, t1, t2, t3; + t0 = reinterpret_cast(vec_mergeh(reinterpret_cast(kernel.packet[0]), reinterpret_cast(kernel.packet[2]))); + t1 = reinterpret_cast(vec_mergel(reinterpret_cast(kernel.packet[0]), reinterpret_cast(kernel.packet[2]))); + t2 = reinterpret_cast(vec_mergeh(reinterpret_cast(kernel.packet[1]), reinterpret_cast(kernel.packet[3]))); + t3 = reinterpret_cast(vec_mergel(reinterpret_cast(kernel.packet[1]), reinterpret_cast(kernel.packet[3]))); + kernel.packet[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(t0), reinterpret_cast(t2))); + kernel.packet[1] = reinterpret_cast(vec_mergel(reinterpret_cast(t0), reinterpret_cast(t2))); + kernel.packet[2] = reinterpret_cast(vec_mergeh(reinterpret_cast(t1), reinterpret_cast(t3))); + kernel.packet[3] = reinterpret_cast(vec_mergel(reinterpret_cast(t1), reinterpret_cast(t3))); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4i t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + 
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet2d t0, t1; + t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); + t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); + kernel.packet[0] = t0; + kernel.packet[1] = t1; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_ZVECTOR_H diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9d49ec4f2..7309a85f8 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -170,14 +170,14 @@ template void packetmath() CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); -/* for(int offset=0;offset<3;++offset) + for(int offset=0;offset<3;++offset) { for (int i=0; i(data1[offset])); VERIFY(areApprox(ref, data2, PacketSize) && "internal::pset1"); - }*/ -/* + } + { for (int i=0; i void packetmath() internal::pstore(data2+1*PacketSize, A1); VERIFY(areApprox(ref, data2, 2*PacketSize) && "internal::pbroadcast2"); } - */ + VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload(data1))) && "internal::pfirst"); if(PacketSize>1) @@ -223,12 +223,12 @@ template void packetmath() VERIFY(areApprox(ref, data2, PacketSize) && "ploadquad"); } } -/* + ref[0] = 0; for (int i=0; i(data1)), refvalue) && "internal::predux"); - + { for (int i=0; i<4; ++i) ref[i] = 0; @@ -284,9 +284,9 @@ template void packetmath() for (int i = 0; i < PacketSize; ++i) { VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue)); } - }*/ + } } -/* + template void packetmath_real() { using std::abs; @@ -471,7 +471,7 @@ template void packetmath_notcomplex() internal::pstore(data2, internal::plset(data1[0])); VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset"); } - +/* template void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) { typedef internal::packet_traits PacketTraits; @@ -577,15 +577,15 @@ void test_packetmath() CALL_SUBTEST_3( packetmath() ); /* CALL_SUBTEST_4( packetmath >() ); CALL_SUBTEST_5( packetmath >() );*/ -/* + CALL_SUBTEST_1( packetmath_notcomplex() ); CALL_SUBTEST_2( packetmath_notcomplex() ); CALL_SUBTEST_3( packetmath_notcomplex() ); - CALL_SUBTEST_1( packetmath_real() ); - CALL_SUBTEST_2( packetmath_real() ); +/* CALL_SUBTEST_1( packetmath_real() ); + CALL_SUBTEST_2( packetmath_real() );*/ - CALL_SUBTEST_4( packetmath_complex >() ); +/* CALL_SUBTEST_4( packetmath_complex >() ); CALL_SUBTEST_5( packetmath_complex >() );*/ CALL_SUBTEST_1( packetmath_scatter_gather() ); -- cgit v1.2.3 From 78f83d6f6aa873b9dc128e83c4fc63d0f384fac1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 28 Mar 2016 09:18:04 -0700 Subject: Prevent potential overflow. 
---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 3408933bf..4f4e07aaf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -132,7 +132,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
     }
-    if (numblocks * blocksize < size) {
+    if (static_cast<Index>(numblocks) * blocksize < size) {
       EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
     }
-- cgit v1.2.3


From 1bc81f78895effe972ef8df5a138d267a74295fb Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 28 Mar 2016 09:21:04 -0700
Subject: Fixed compilation warnings on arm

---
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 9875601ba..00f870328 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -254,7 +254,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
     } else {
       const Index blocksize = std::floor(static_cast<float>(num_coeffs) / num_threads);
       const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
-      eigen_assert(num_coeffs >= numblocks * blocksize);
+      eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);

       Barrier barrier(numblocks);
       MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
@@ -264,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
       }

       typename Self::CoeffReturnType finalShard;
-      if (numblocks * blocksize < num_coeffs) {
+      if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
         finalShard = InnerMostDimReducer<Self, Op, false>::reduce(
             self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
       } else {
@@ -301,7 +301,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
      }
      const Index blocksize = std::floor(static_cast<float>(num_coeffs) / num_threads);
      const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
-     eigen_assert(num_coeffs >= numblocks * blocksize);
+     eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);

      Barrier barrier(numblocks);
      MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
@@ -311,7 +311,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
                                 &shards[i]);
      }
      typename Self::CoeffReturnType finalShard;
-    if (numblocks * blocksize < num_coeffs) {
+    if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
       finalShard = InnerMostDimReducer<Self, Op, true>::reduce(
           self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
     } else {
-- cgit v1.2.3


From 6772f653c33bd78c25623619581836bac1d1d20a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 28 Mar 2016 10:01:04 -0700
Subject: Made it possible to customize the threadpool

---
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 79 ++++++++++++++++------
 1 file changed, 57 insertions(+), 22 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 23b1765ba..cd3dd214b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -24,36 +24,40 @@ class ThreadPoolInterface {
 // The implementation of the ThreadPool type ensures that the Schedule method
 // runs the functions it is provided in FIFO order when the scheduling is done
 // by a single thread.
-class ThreadPool : public ThreadPoolInterface {
+// Environment provides a way to create threads and also allows to intercept
+// task submission and execution.
+template <typename Environment>
+class ThreadPoolTempl : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) {
+  explicit ThreadPoolTempl(int num_threads, Environment env = Environment())
+      : env_(env), threads_(num_threads), waiters_(num_threads) {
     for (int i = 0; i < num_threads; i++) {
-      threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
+      threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
     }
   }

   // Wait until all scheduled work has finished and then destroy the
   // set of threads.
-  ~ThreadPool()
-  {
+  ~ThreadPoolTempl() {
     {
       // Wait for all work to get done.
       std::unique_lock<std::mutex> l(mu_);
-      empty_.wait(l, [this]() { return pending_.empty(); });
+      while (!pending_.empty()) {
+        empty_.wait(l);
+      }
       exiting_ = true;

       // Wakeup all waiters.
       for (auto w : waiters_) {
         w->ready = true;
-        w->work = nullptr;
+        w->task.f = nullptr;
         w->cv.notify_one();
       }
     }

     // Wait for threads to finish.
     for (auto t : threads_) {
-      t->join();
       delete t;
     }
   }

@@ -61,14 +65,15 @@ class ThreadPool : public ThreadPoolInterface {
   // Schedule fn() for execution in the pool of threads. The functions are
   // executed in the order in which they are scheduled.
   void Schedule(std::function<void()> fn) {
+    Task t = env_.CreateTask(std::move(fn));
     std::unique_lock<std::mutex> l(mu_);
     if (waiters_.empty()) {
-      pending_.push_back(fn);
+      pending_.push_back(std::move(t));
     } else {
       Waiter* w = waiters_.back();
       waiters_.pop_back();
       w->ready = true;
-      w->work = fn;
+      w->task = std::move(t);
       w->cv.notify_one();
     }
   }

@@ -77,46 +82,76 @@
   void WorkerLoop() {
     std::unique_lock<std::mutex> l(mu_);
     Waiter w;
+    Task t;
     while (!exiting_) {
-      std::function<void()> fn;
       if (pending_.empty()) {
         // Wait for work to be assigned to me
        w.ready = false;
        waiters_.push_back(&w);
-        w.cv.wait(l, [&w]() { return w.ready; });
-        fn = w.work;
-        w.work = nullptr;
+        while (!w.ready) {
+          w.cv.wait(l);
+        }
+        t = w.task;
+        w.task.f = nullptr;
      } else {
        // Pick up pending work
-        fn = pending_.front();
+        t = std::move(pending_.front());
        pending_.pop_front();
        if (pending_.empty()) {
          empty_.notify_all();
        }
      }
-      if (fn) {
+      if (t.f) {
        mu_.unlock();
-        fn();
+        env_.ExecuteTask(t);
+        t.f = nullptr;
        mu_.lock();
      }
    }
  }

 private:
+  typedef typename Environment::Task Task;
+  typedef typename Environment::EnvThread Thread;
+
  struct Waiter {
    std::condition_variable cv;
-    std::function<void()> work;
+    Task task;
    bool ready;
  };

+  Environment env_;
  std::mutex mu_;
-  MaxSizeVector<std::thread*> threads_;  // All threads
-  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
-  std::deque<std::function<void()>> pending_;  // Queue of pending work
-  std::condition_variable empty_;  // Signaled on pending_.empty()
+  MaxSizeVector<Thread*> threads_;          // All threads
+  MaxSizeVector<Waiter*> waiters_;          // Stack of waiting threads.
+  std::deque<Task> pending_;                // Queue of pending work
+  std::condition_variable empty_;           // Signaled on pending_.empty()
  bool exiting_ = false;
};

+struct StlThreadEnvironment {
+  struct Task {
+    std::function<void()> f;
+  };
+
+  // EnvThread constructor must start the thread,
+  // destructor must join the thread.
+  class EnvThread {
+   public:
+    EnvThread(std::function<void()> f) : thr_(f) {}
+    ~EnvThread() { thr_.join(); }
+
+   private:
+    std::thread thr_;
+  };
+
+  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
+  Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
+  void ExecuteTask(const Task& t) { t.f(); }
+};
+
+typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
+
 // Barrier is an object that allows one or more threads to wait until
 // Notify has been called a specified number of times.
-- cgit v1.2.3
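The Environment hook introduced by this patch can be exercised with a small custom environment. The sketch below is illustrative only: it assumes nothing beyond the interface visible in the diff above (CreateThread, CreateTask, ExecuteTask and the nested Task and EnvThread types) and is not code from the Eigen tree. The counter lives behind a shared_ptr because ThreadPoolTempl stores a copy of the environment object.

#include <atomic>
#include <functional>
#include <memory>
#include <thread>

// Hypothetical environment that counts how many tasks are submitted.
struct CountingEnvironment {
  struct Task {
    std::function<void()> f;
  };

  // Per the contract stated in the patch: the constructor starts the
  // thread, the destructor joins it.
  class EnvThread {
   public:
    explicit EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
    ~EnvThread() { thr_.join(); }
   private:
    std::thread thr_;
  };

  EnvThread* CreateThread(std::function<void()> f) {
    return new EnvThread(std::move(f));
  }
  Task CreateTask(std::function<void()> f) {
    scheduled_->fetch_add(1);  // intercept task submission
    return Task{std::move(f)};
  }
  void ExecuteTask(const Task& t) { t.f(); }  // intercept task execution

  std::shared_ptr<std::atomic<int> > scheduled_{new std::atomic<int>(0)};
};

// Usage sketch, assuming the ThreadPoolTempl from the patch is in scope:
//   Eigen::ThreadPoolTempl<CountingEnvironment> pool(4);
//   pool.Schedule([]{ /* work */ });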
-- cgit v1.2.3


From c38295f0a03edcd9f8325fcb08484eb579b7841f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 28 Mar 2016 15:53:02 -0700
Subject: Added support for fmod

---
 Eigen/src/Core/MathFunctions.h                | 22 ++++++++++++++++++++++
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h   | 14 ++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index ec75175ca..6ffad6c29 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1053,6 +1053,28 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double exp(const double &x) { return ::exp(x); }
 #endif

+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T fmod(const T& a, const T& b) {
+  EIGEN_USING_STD_MATH(floor);
+  return fmod(a, b);
+}
+
+#ifdef __CUDACC__
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float fmod(const float& a, const float& b) {
+  return ::fmodf(a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double fmod(const double& a, const double& b) {
+  return ::fmod(a, b);
+}
+#endif
+
 } // end namespace numext

 namespace internal {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index eb0c8d1ce..b7c13f67f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -40,6 +40,20 @@ template <typename Scalar>
 struct functor_traits<scalar_mod_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };

+template <typename Scalar>
+struct scalar_fmod_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    return numext::fmod(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_fmod_op<Scalar> > {
+  enum { Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
+         PacketAccess = false };
+};
+

 /** \internal
   * \brief Template functor to compute the sigmoid of a scalar
-- cgit v1.2.3


From e02b784ec3c4f1932d7b4c3805c2fc56e35f8c3f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 29 Mar 2016 09:20:36 -0700
Subject: Added support for standard mathematical functions and transcendentals
 (such as exp, log, abs, ...) on fp16

---
 Eigen/src/Core/arch/CUDA/Half.h | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 61131828f..6c412159c 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -361,9 +361,6 @@ static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) {
 } // end namespace Eigen

 // Standard mathematical functions and transcendentals.
-
-namespace std {
-
 static inline EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) {
   Eigen::half result;
   result.x = a.x & 0x7FFF;
@@ -375,6 +372,36 @@ static inline EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) {
 static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
   return Eigen::half(::logf(float(a)));
 }
+static inline EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) {
+  return Eigen::half(::sqrtf(float(a)));
+}
+static inline EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) {
+  return Eigen::half(::floorf(float(a)));
+}
+static inline EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) {
+  return Eigen::half(::ceilf(float(a)));
+}
+static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) {
+  return (Eigen::numext::isnan)(a);
+}
+static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) {
+  return (Eigen::numext::isinf)(a);
+}
+static inline EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) {
+  return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
+}
+
+
+namespace std {
+
+// Import the standard mathematical functions and transcendentals into
+// the std namespace.
+using ::abs;
+using ::exp;
+using ::log;
+using ::sqrt;
+using ::floor;
+using ::ceil;

 } // end namespace std
-- cgit v1.2.3


From 7b7d2a9fa52fb4537849c69a0b193a9284b42bb1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 29 Mar 2016 11:50:17 -0700
Subject: Use false instead of 0 as the expected value of a boolean

---
 unsupported/test/cxx11_meta.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
index 62db0e71e..ecac3add1 100644
--- a/unsupported/test/cxx11_meta.cpp
+++ b/unsupported/test/cxx11_meta.cpp
@@ -250,8 +250,8 @@ static void test_is_same_gf()
 {
   VERIFY((!is_same_gf<dummy_a, dummy_b>::value));
   VERIFY((!!is_same_gf<dummy_a, dummy_a>::value));
-  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), 0);
-  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), 0);
+  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), false);
+  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), false);
 }

 static void test_apply_op()
-- cgit v1.2.3


From 1841d6d4c349e1e1d64820a26aaab5df30750400 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 29 Mar 2016 13:29:34 -0700
Subject: Added missing cuda template specializations for numext::ceil

---
 Eigen/src/Core/MathFunctions.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 6ffad6c29..000cafee7 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -962,6 +962,15 @@ T (ceil)(const T& x)
   return ceil(x);
 }

+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float ceil(const float &x) { return ::ceilf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double ceil(const double &x) { return ::ceil(x); }
+#endif
+
+
 /** Log base 2 for 32 bits positive integers.
   * Conveniently returns 0 for x==0.
   */
 inline int log2(int x)
-- cgit v1.2.3
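The fmod patch a few commits up adds both a numext::fmod entry point and an internal scalar_fmod_op functor. The following host-side sketch exercises the functor through the Tensor module's binaryExpr; it is illustrative only, and keep in mind that Eigen::internal APIs carry no stability guarantee:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 1> a(4), b(4);
  a.setValues({5.f, 6.5f, -3.f, 9.f});
  b.setValues({2.f, 2.f,  2.f, 4.f});

  // scalar_fmod_op is the functor added in the patch above; it forwards to
  // numext::fmod, which in turn resolves to ::fmodf / ::fmod in device code.
  Eigen::Tensor<float, 1> r =
      a.binaryExpr(b, Eigen::internal::scalar_fmod_op<float>());

  std::cout << r << std::endl;  // expected: 1 0.5 -1 1
  return 0;
}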
From 09ad31aa855055b3ec423d2638bcb2345524ad2f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 29 Mar 2016 22:33:57 +0200
Subject: Add regression test for nesting type handling in blas_traits

---
 test/product.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/product.h b/test/product.h
index 45bb64958..27976a4ae 100644
--- a/test/product.h
+++ b/test/product.h
@@ -188,4 +188,12 @@ template<typename MatrixType> void product(const MatrixType& m)
     // CwiseUnaryOp
     VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z);
   }
+
+  // regression for blas_traits
+  {
+    VERIFY_IS_APPROX(square * (square*square).transpose(), square * square.transpose() * square.transpose());
+    VERIFY_IS_APPROX(square * (-(square*square)), -square * square * square);
+    VERIFY_IS_APPROX(square * (s1*(square*square)), s1 * square * square * square);
+    VERIFY_IS_APPROX(square * (square*square).conjugate(), square * square.conjugate() * square.conjugate());
+  }
 }
-- cgit v1.2.3


From 56df5ef1d777550148a6d16eacec09b2d9c16a9a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 29 Mar 2016 15:03:38 -0700
Subject: Attempt to fix the formatting of the README

---
 unsupported/Eigen/CXX11/src/Tensor/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 407485090..e60fdd413 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -11,7 +11,7 @@ You can manipulate a tensor with one of the following classes.  They all are in
 the namespace ```::Eigen.```

-### Class Tensor&lt;data_type, rank&gt;
+### Class Tensor;

 This is the class to use to create a tensor and allocate memory for it.  The
 class is templatized with the tensor datatype, such as float or int, and the
-- cgit v1.2.3


From aa45ad2aac1f5a26e4258c58aa1712035351b536 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 29 Mar 2016 15:06:13 -0700
Subject: Fixed the formatting of the README.

---
 unsupported/Eigen/CXX11/src/Tensor/README.md | 138 +++++++++++++--------------
 1 file changed, 69 insertions(+), 69 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index e60fdd413..eeca2f69e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -11,7 +11,7 @@ You can manipulate a tensor with one of the following classes.  They all are in
 the namespace ```::Eigen.```

-### Class Tensor;
+### Class Tensor<data_type, rank>

 This is the class to use to create a tensor and allocate memory for it.  The
 class is templatized with the tensor datatype, such as float or int, and the
 matrix.  Tensors of this class are resizable.  For example, if you assign a
 tensor of a different size to a Tensor, that tensor is resized to match its
 new value.

-#### Constructor Tensor&lt;data_type, rank&gt;(size0, size1, ...)
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)

 Constructor for a Tensor.  The constructor must be passed ```rank``` integers
 indicating the sizes of the instance along each of the ```rank```
 dimensions.

     // Create a tensor of rank 3 of sizes 2, 3, 4.  This tensor owns
     // memory to hold 24 floating point values (24 = 2 x 3 x 4).
     Tensor<float, 3> t_3d(2, 3, 4);

     // Resize t_3d by assigning a tensor of different sizes, but same rank.
     t_3d = Tensor<float, 3>(3, 4, 3);

-#### Constructor Tensor&lt;data_type, rank&gt;(size_array)
+#### Constructor Tensor<data_type, rank>(size_array)

 Constructor where the sizes for the constructor are specified as an array of
 values instead of an explicit list of parameters.
The array type to use is -```Eigen::array<Eigen::Index>```. The array can be constructed automatically +```Eigen::array```. The array can be constructed automatically from an initializer list. // Create a tensor of strings of rank 2 with sizes 5, 7. Tensor t_2d({5, 7}); -### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>> +### Class TensorFixedSize> Class to use for tensors of fixed size, where the size is known at compile time. Fixed sized tensors can provide very fast computations because all their @@ -57,7 +57,7 @@ tensor data is held onto the stack and does not cause heap allocation and free. // Create a 4 x 3 tensor of floats. TensorFixedSize> t_4x3; -### Class TensorMap<Tensor<data_type, rank>> +### Class TensorMap> This is the class to use to create a tensor on top of memory allocated and owned by another part of your code. It allows to view any piece of allocated @@ -67,7 +67,7 @@ data are stored. A TensorMap is not resizable because it does not own the memory where its data are stored. -#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...) +#### Constructor TensorMap>(data, size0, size1, ...) Constructor for a Tensor. The constructor must be passed a pointer to the storage for the data, and "rank" size attributes. The storage has to be @@ -93,7 +93,7 @@ See Assigning to a TensorRef below. ## Accessing Tensor Elements -#### <data_type> tensor(index0, index1...) +#### tensor(index0, index1...) Return the element at position ```(index0, index1...)``` in tensor ```tensor```. You must pass as many parameters as the rank of ```tensor```. @@ -175,7 +175,7 @@ the following code computes the elementwise addition of two tensors: While the code above looks easy enough, it is important to understand that the expression ```t1 + t2``` is not actually adding the values of the tensors. The expression instead constructs a "tensor operator" object of the class -TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors +TensorCwiseBinaryOp, which has references to the tensors ```t1``` and ```t2```. This is a small C++ object that knows how to add ```t1``` and ```t2```. It is only when the value of the expression is assigned to the tensor ```t3``` that the addition is actually performed. Technically, @@ -452,24 +452,24 @@ memory for tensors with cuda. In the documentation of the tensor methods and Operation we mention datatypes that are tensor-type specific: -#### <Tensor-Type>::Dimensions +#### ::Dimensions Acts like an array of ints. Has an ```int size``` attribute, and can be indexed like an array to access individual values. Used to represent the dimensions of a tensor. See ```dimensions()```. -#### <Tensor-Type>::Index +#### ::Index Acts like an ```int```. Used for indexing tensors along their dimensions. See ```operator()```, ```dimension()```, and ```size()```. -#### <Tensor-Type>::Scalar +#### ::Scalar Represents the datatype of individual tensor elements. For example, for a ```Tensor```, ```Scalar``` is the type ```float```. See ```setConstant()```. -#### <Operation> +#### We use this pseudo type to indicate that a tensor Operation is returned by a method. We indicate in the text the type and dimensions of the tensor that the @@ -602,7 +602,7 @@ You can use one of the methods below to initialize the tensor memory. These have an immediate effect on the tensor and return the tensor itself as a result. These are not tensor Operations which delay evaluation. 
-### <Tensor-Type> setConstant(const Scalar& val) +### setConstant(const Scalar& val) Sets all elements of the tensor to the constant value ```val```. ```Scalar``` is the type of data stored in the tensor. You can pass any value that is @@ -630,7 +630,7 @@ has a copy constructor and an ```operator=()```: yolo yolo yolo -### <Tensor-Type> setZero() +### setZero() Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```. Returns the tensor itself in case you want to chain another call. @@ -644,7 +644,7 @@ Returns the tensor itself in case you want to chain another call. 0 0 0 0 -### <Tensor-Type> setValues({..initializer_list}) +### setValues({..initializer_list}) Fills the tensor with explicit values specified in a std::initializer_list. The type of the initializer list depends on the type and rank of the tensor. @@ -680,7 +680,7 @@ code only sets the values of the first row of the tensor. 10 20 30 1000 1000 1000 -### <Tensor-Type> setRandom() +### setRandom() Fills the tensor with random values. Returns the tensor itself in case you want to chain another call. @@ -775,7 +775,7 @@ The chain of Operation is evaluated lazily, typically when it is assigned to a tensor. See "Controlling when Expression are Evaluated" for more details about their evaluation. -### <Operation> constant(const Scalar& val) +### constant(const Scalar& val) Returns a tensor of the same type and dimensions as the original tensor but where all elements have the value ```val```. @@ -803,7 +803,7 @@ tensor, or multiply every element of a tensor by a scalar. 0.6 0.6 0.6 0.6 0.6 0.6 -### <Operation> random() +### random() Returns a tensor of the same type and dimensions as the current tensor but where all elements have random values. @@ -833,7 +833,7 @@ All these operations take a single input tensor as argument and return a tensor of the same type and dimensions as the tensor to which they are applied. The requested operations are applied to each element independently. -### <Operation> operator-() +### operator-() Returns a tensor of the same type and dimensions as the original tensor containing the opposite values of the original tensor. @@ -852,42 +852,42 @@ containing the opposite values of the original tensor. -1 -1 -1 -1 -1 -1 -### <Operation> sqrt() +### sqrt() Returns a tensor of the same type and dimensions as the original tensor containing the square roots of the original tensor. -### <Operation> rsqrt() +### rsqrt() Returns a tensor of the same type and dimensions as the original tensor containing the inverse square roots of the original tensor. -### <Operation> square() +### square() Returns a tensor of the same type and dimensions as the original tensor containing the squares of the original tensor values. -### <Operation> inverse() +### inverse() Returns a tensor of the same type and dimensions as the original tensor containing the inverse of the original tensor values. -### <Operation> exp() +### exp() Returns a tensor of the same type and dimensions as the original tensor containing the exponential of the original tensor. -### <Operation> log() +### log() Returns a tensor of the same type and dimensions as the original tensor containing the natural logarithms of the original tensor. -### <Operation> abs() +### abs() Returns a tensor of the same type and dimensions as the original tensor containing the absolute values of the original tensor. 
-### <Operation> pow(Scalar exponent) +### pow(Scalar exponent) Returns a tensor of the same type and dimensions as the original tensor containing the coefficients of the original tensor to the power of the @@ -914,17 +914,17 @@ cubic roots of an int Tensor: 0 1 2 3 4 5 -### <Operation> operator * (Scalar scale) +### operator * (Scalar scale) Multiplies all the coefficients of the input tensor by the provided scale. -### <Operation> cwiseMax(Scalar threshold) +### cwiseMax(Scalar threshold) TODO -### <Operation> cwiseMin(Scalar threshold) +### cwiseMin(Scalar threshold) TODO -### <Operation> unaryExpr(const CustomUnaryOp& func) +### unaryExpr(const CustomUnaryOp& func) TODO @@ -936,39 +936,39 @@ dimensions as the tensors to which they are applied, and unless otherwise specified it is also of the same type. The requested operations are applied to each pair of elements independently. -### <Operation> operator+(const OtherDerived& other) +### operator+(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise sums of the inputs. -### <Operation> operator-(const OtherDerived& other) +### operator-(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise differences of the inputs. -### <Operation> operator*(const OtherDerived& other) +### operator*(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise products of the inputs. -### <Operation> operator/(const OtherDerived& other) +### operator/(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise quotients of the inputs. This operator is not supported for integer types. -### <Operation> cwiseMax(const OtherDerived& other) +### cwiseMax(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise maximums of the inputs. -### <Operation> cwiseMin(const OtherDerived& other) +### cwiseMin(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise mimimums of the inputs. -### <Operation> Logical operators +### Logical operators The following logical operators are supported as well: @@ -1119,50 +1119,50 @@ one-dimension tensor with a single value. 276 -### <Operation> sum(const Dimensions& new_dims) -### <Operation> sum() +### sum(const Dimensions& new_dims) +### sum() Reduce a tensor using the sum() operator. The resulting values are the sum of the reduced values. -### <Operation> mean(const Dimensions& new_dims) -### <Operation> mean() +### mean(const Dimensions& new_dims) +### mean() Reduce a tensor using the mean() operator. The resulting values are the mean of the reduced values. -### <Operation> maximum(const Dimensions& new_dims) -### <Operation> maximum() +### maximum(const Dimensions& new_dims) +### maximum() Reduce a tensor using the maximum() operator. The resulting values are the largest of the reduced values. -### <Operation> minimum(const Dimensions& new_dims) -### <Operation> minimum() +### minimum(const Dimensions& new_dims) +### minimum() Reduce a tensor using the minimum() operator. The resulting values are the smallest of the reduced values. -### <Operation> prod(const Dimensions& new_dims) -### <Operation> prod() +### prod(const Dimensions& new_dims) +### prod() Reduce a tensor using the prod() operator. 
The resulting values are the product of the reduced values. -### <Operation> all(const Dimensions& new_dims) -### <Operation> all() +### all(const Dimensions& new_dims) +### all() Reduce a tensor using the all() operator. Casts tensor to bool and then checks whether all elements are true. Runs through all elements rather than short-circuiting, so may be significantly inefficient. -### <Operation> any(const Dimensions& new_dims) -### <Operation> any() +### any(const Dimensions& new_dims) +### any() Reduce a tensor using the any() operator. Casts tensor to bool and then checks whether any element is true. Runs through all elements rather than short-circuiting, so may be significantly inefficient. -### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer) +### reduce(const Dimensions& new_dims, const Reducer& reducer) Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` in TensorFunctors.h for information on how to implement a reduction operator. @@ -1170,7 +1170,7 @@ in TensorFunctors.h for information on how to implement a reduction operator. ## Convolutions -### <Operation> convolve(const Kernel& kernel, const Dimensions& dims) +### convolve(const Kernel& kernel, const Dimensions& dims) Returns a tensor that is the output of the convolution of the input tensor with the kernel, along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor @@ -1213,7 +1213,7 @@ These operations return a Tensor with different dimensions than the original Tensor. They can be used to access slices of tensors, see them with different dimensions, or pad tensors with additional data. -### <Operation> reshape(const Dimensions& new_dims) +### reshape(const Dimensions& new_dims) Returns a view of the input tensor that has been reshaped to the specified new dimensions. The argument new_dims is an array of Index values. The @@ -1292,7 +1292,7 @@ Note that "b" itself was not reshaped but that instead the assignment is done to the reshape view of b. -### <Operation> shuffle(const Shuffle& shuffle) +### shuffle(const Shuffle& shuffle) Returns a copy of the input tensor whose dimensions have been reordered according to the specified permutation. The argument shuffle @@ -1333,7 +1333,7 @@ Let's rewrite the previous example to take advantage of this feature: output.shuffle({2, 0, 1}) = input; -### <Operation> stride(const Strides& strides) +### stride(const Strides& strides) Returns a view of the input tensor that strides (skips stride-1 elements) along each of the dimensions. The argument strides is an @@ -1359,7 +1359,7 @@ It is possible to assign a tensor to a stride: output.stride({2, 3, 4}) = input; -### <Operation> slice(const StartIndices& offsets, const Sizes& extents) +### slice(const StartIndices& offsets, const Sizes& extents) Returns a sub-tensor of the given tensor. For each dimension i, the slice is made of the coefficients stored between offset[i] and offset[i] + extents[i] in @@ -1385,7 +1385,7 @@ the input tensor. 600 700 -### <Operation> chip(const Index offset, const Index dim) +### chip(const Index offset, const Index dim) A chip is a special kind of slice. It is the subtensor at the given offset in the dimension dim. The returned tensor has one fewer dimension than the input @@ -1436,7 +1436,7 @@ lvalue. 
For example: 0 0 0 -### <Operation> reverse(const ReverseDimensions& reverse) +### reverse(const ReverseDimensions& reverse) Returns a view of the input tensor that reverses the order of the coefficients along a subset of the dimensions. The argument reverse is an array of boolean @@ -1466,7 +1466,7 @@ of a 2D tensor: 0 100 200 -### <Operation> broadcast(const Broadcast& broadcast) +### broadcast(const Broadcast& broadcast) Returns a view of the input tensor in which the input is replicated one to many times. @@ -1490,11 +1490,11 @@ made in each of the dimensions. 0 100 200 0 100 200 300 400 500 300 400 500 -### <Operation> concatenate(const OtherDerived& other, Axis axis) +### concatenate(const OtherDerived& other, Axis axis) TODO -### <Operation> pad(const PaddingDimensions& padding) +### pad(const PaddingDimensions& padding) Returns a view of the input tensor in which the input is padded with zeros. @@ -1519,7 +1519,7 @@ Returns a view of the input tensor in which the input is padded with zeros. 0 0 0 0 -### <Operation> extract_patches(const PatchDims& patch_dims) +### extract_patches(const PatchDims& patch_dims) Returns a tensor of coefficient patches extracted from the input tensor, where each patch is of dimension specified by 'patch_dims'. The returned tensor has @@ -1606,7 +1606,7 @@ patch index: 5 6 7 10 11 -### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols, +### extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type) @@ -1663,7 +1663,7 @@ sizes: ## Special Operations -### <Operation> cast<T>() +### cast() Returns a tensor of type T with the same dimensions as the original tensor. The returned tensor contains the values of the original tensor converted to @@ -1692,7 +1692,7 @@ but you can easily cast the tensors to floats to do the division: 1 2 2 -### <Operation> eval() +### eval() TODO @@ -1701,7 +1701,7 @@ TODO Scalar values are often represented by tensors of size 1 and rank 1. It would be more logical and user friendly to use tensors of rank 0 instead. For example -Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner +Tensor::maximum() currently returns a Tensor. Similarly, the inner product of 2 1d tensors (through contractions) returns a 1d tensor. In the future these operations might be updated to return 0d tensors instead. 
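The README changes above are purely about how the angle-bracket headings render; the underlying API is unchanged. A short, self-contained program (not part of the patches, values arbitrary) matching the sections quoted in the diff, namely construction, setValues(), reductions and shuffle():

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  // Rank-2 tensor, per the "Class Tensor" section.
  Eigen::Tensor<int, 2> t(2, 3);
  t.setValues({{0, 100, 200}, {300, 400, 500}});

  // Reduce over dimension 0, per the "sum(const Dimensions& new_dims)" section.
  Eigen::array<int, 1> dims({0});
  Eigen::Tensor<int, 1> col_sums = t.sum(dims);
  std::cout << "column sums: " << col_sums << std::endl;  // 300 500 700

  // Reorder dimensions, per the "shuffle(const Shuffle& shuffle)" section.
  Eigen::array<int, 2> order({1, 0});
  Eigen::Tensor<int, 2> transposed = t.shuffle(order);
  std::cout << "transposed:" << std::endl << transposed << std::endl;
  return 0;
}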
-- cgit v1.2.3 From 01b5333e44eea55f1dfc5edd7ec8ffec866d64ca Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 30 Mar 2016 11:02:33 -0400 Subject: bug #1186 - vreinterpretq_u64_f64 fails to build on Android/Aarch64/Clang toolchain --- Eigen/src/Core/arch/NEON/PacketMath.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index fc4c0d03a..fead02916 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -532,20 +532,21 @@ ptranspose(PacketBlock& kernel) { #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG -#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) // Bug 907: workaround missing declarations of the following two functions in the ADK -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vreinterpretq_u64_f64 (float64x2_t __a) +// Defining these functions as templates ensures that if these intrinsics are +// already defined in arm_neon.h, then our workaround doesn't cause a conflict +// and has lower priority in overload resolution. +template +uint64x2_t vreinterpretq_u64_f64(T a) { - return (uint64x2_t) __a; + return (uint64x2_t) a; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vreinterpretq_f64_u64 (uint64x2_t __a) +template +float64x2_t vreinterpretq_f64_u64(T a) { - return (float64x2_t) __a; + return (float64x2_t) a; } -#endif typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; -- cgit v1.2.3 From 1b40abbf99d2022d1167063f7e52126cbe8d76bd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 30 Mar 2016 13:17:03 -0700 Subject: Added missing assignment operator to the TensorUInt128 class, and made misc small improvements --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 13 ++++++++++++- unsupported/test/cxx11_tensor_uint128.cpp | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 02d6646d8..543a444fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -40,14 +40,25 @@ struct TensorUInt128 EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128& operator = (const TensorUInt128& other) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + high = other.high; + low = other.low; + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { + eigen_assert(x < NumTraits::highest()); eigen_assert(x >= 0); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { } + TensorUInt128(HIGH y, LOW x) : high(y), low(x) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const { return low; diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp index 2cbc45716..d2a1e8673 100644 --- a/unsupported/test/cxx11_tensor_uint128.cpp +++ b/unsupported/test/cxx11_tensor_uint128.cpp @@ -147,7 +147,7 @@ void test_misc2() { void test_cxx11_tensor_uint128() { #ifdef EIGEN_NO_INT128 - // Skip the test on compilers that don't support 128bit integers natively + // Skip 
the test on compilers that don't support 128bit integers natively
   return;
 #else
   CALL_SUBTEST_1(test_add());
-- cgit v1.2.3


From 483aaad10a925b5b22ea87bcabe01712db4fe870 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 30 Mar 2016 17:08:13 -0700
Subject: Fixed compilation warning

---
 unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 543a444fb..f68ac1794 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -53,7 +53,9 @@ struct TensorUInt128
   template<typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   explicit TensorUInt128(const T& x) : high(0), low(x) {
-    eigen_assert(x < NumTraits<LOW>::highest());
+    typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type UnsignedT;
+    typedef typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type UnsignedLow;
+    eigen_assert(static_cast<UnsignedT>(x) < static_cast<UnsignedLow>(NumTraits<LOW>::highest()));
     eigen_assert(x >= 0);
   }
-- cgit v1.2.3


From bc68fc2fe73adba1cf4d0b40d99d201c3f12bb64 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 30 Mar 2016 17:58:32 -0700
Subject: Enable constant expressions when compiling cuda code with clang.

---
 Eigen/src/Core/util/Macros.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index dbfc9bd37..97627d14c 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -370,8 +370,8 @@

 // Does the compiler support const expressions?
 #ifdef __CUDACC__
-// Const expressions are supported provided that c++11 is enabled and we're using nvcc 7.5 or above
-#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 70500 && __cplusplus > 199711L
+// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500)
   #define EIGEN_HAS_CONSTEXPR 1
 #endif
 #elif (defined(__cplusplus) && __cplusplus >= 201402L) || \
-- cgit v1.2.3


From 4f1a7e51c17586487c986a456e39af40b41bf4b4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 30 Mar 2016 17:59:49 -0700
Subject: Pull math functions from the global namespace only when compiling cuda
 code with nvcc. When compiling with clang, we want to use the std namespace.

---
 Eigen/Core | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Eigen/Core b/Eigen/Core
index 8428c51e4..24799f32b 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -42,7 +42,10 @@
 #endif

-#if defined(__CUDA_ARCH__)
+// When compiling CUDA device code with NVCC, pull in math functions from the
+// global namespace. In host mode, and when compiling device code with clang,
+// use the std versions.
+#if defined(__CUDA_ARCH__) && defined(__NVCC__)
   #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
 #else
   #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
-- cgit v1.2.3


From 791e5cfb6990220b2cfdb7b6f793298a5153561b Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 30 Mar 2016 18:36:36 -0700
Subject: Added NumTraits for type2index.
--- unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 01c31c13e..985594bc8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -45,6 +45,23 @@ struct type2index { } }; +template struct NumTraits > +{ + typedef DenseIndex Real; + enum { + IsComplex = 0, + RequireInitialization = false, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; + + EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static inline Real highest() { return n; } + EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } +}; + namespace internal { template EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { -- cgit v1.2.3 From af4ef540bfeb381daaae86f91d492eed39f84e68 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 30 Mar 2016 18:37:19 -0700 Subject: Fixed a off-by-one bug in a debug assertion --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index f68ac1794..3e56589c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -55,7 +55,7 @@ struct TensorUInt128 explicit TensorUInt128(const T& x) : high(0), low(x) { typedef typename conditional::type UnsignedT; typedef typename conditional::type UnsignedLow; - eigen_assert(static_cast(x) < static_cast(NumTraits::highest())); + eigen_assert(static_cast(x) <= static_cast(NumTraits::highest())); eigen_assert(x >= 0); } -- cgit v1.2.3 From 8c8a79cec1b7d03be30df0e70cea236b8f52ef64 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 10:33:32 -0700 Subject: Fixed a typo --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 6c412159c..ace250c6f 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -348,7 +348,7 @@ namespace numext { static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { return (a.x & 0x7fff) == 0x7c00; } -static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) { +static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hisnan(a); #else -- cgit v1.2.3 From b575fb1d02f7a98c94a576284fbcd4ff85970120 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 10:43:59 -0700 Subject: Added NumTraits for half floats --- Eigen/src/Core/arch/CUDA/Half.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index ace250c6f..dc7119c06 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -341,6 +341,18 @@ template<> struct is_arithmetic { enum { value = true }; }; } // end namespace internal +template<> struct NumTraits + : GenericNumTraits +{ + EIGEN_DEVICE_FUNC static inline float dummy_precision() { return 1e-3f; } + EIGEN_DEVICE_FUNC static inline Eigen::half highest() { + return internal::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC static 
inline Eigen::half lowest() { + return internal::raw_uint16_to_half(0xfbff); + } +}; + // Infinity/NaN checks. namespace numext { -- cgit v1.2.3 From c36ab1990247a5b60bcad564759e8903f30fbab5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 10:55:03 -0700 Subject: Added __ldg primitive for fp16. --- Eigen/src/Core/arch/CUDA/Half.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index dc7119c06..a2a2bac37 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -423,7 +423,14 @@ using ::ceil; __device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { return static_cast(__shfl_xor(static_cast(var), laneMask, width)); } +#endif +// ldg() has an overload for __half, but we also need one for Eigen::half. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320 +static inline EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::internal::raw_uint16_to_half( + __ldg(reinterpret_cast(ptr))); +} #endif -- cgit v1.2.3 From 4c859181daa3807f54ee7ae8add6bac66e896ace Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 12:48:38 -0700 Subject: Made it possible to use the NumTraits for complex and Array in a cuda kernel. --- Eigen/src/Core/NumTraits.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index b7b5e7d22..e065fa714 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -153,7 +153,9 @@ template struct NumTraits > MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost }; + EIGEN_DEVICE_FUNC static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return NumTraits::dummy_precision(); } }; @@ -166,7 +168,7 @@ struct NumTraits > typedef typename NumTraits::NonInteger NonIntegerScalar; typedef Array NonInteger; typedef ArrayType & Nested; - + enum { IsComplex = NumTraits::IsComplex, IsInteger = NumTraits::IsInteger, @@ -176,8 +178,10 @@ struct NumTraits > AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::AddCost, MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::MulCost }; - + + EIGEN_DEVICE_FUNC static inline RealScalar epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } }; -- cgit v1.2.3 From 0f5cc504fe2e024c723943c55cf87eedfe12dd8f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 12:59:39 -0700 Subject: Properly gate the fft code --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 7086a426d..1918392d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -10,8 +10,9 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H -// NVCC fails to compile this code -#if !defined(__CUDACC__) +// This code requires the ability to initialize arrays of constant +// values directly inside a class. 
+#ifdef EIGEN_HAS_CONSTEXPR namespace Eigen { @@ -638,7 +639,7 @@ struct TensorEvaluator, D } // end namespace Eigen -#endif // __CUDACC__ +#endif // EIGEN_HAS_CONSTEXPR #endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H -- cgit v1.2.3 From f197813f370c7977bdd6023c13e08dfaf1f9498d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 13:09:23 -0700 Subject: Added the ability to hash a fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index a2a2bac37..44645522a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -414,6 +414,13 @@ using ::log; using ::sqrt; using ::floor; using ::ceil; +template <> + +struct hash<Eigen::half> { + size_t operator()(const Eigen::half& a) const { + return std::hash<unsigned short>()(a.x); + } +}; } // end namespace std -- cgit v1.2.3 From 92b7f7b6503f2fa66e1f346b88fb6bff434d4d1d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 13:09:58 -0700 Subject: Improved code formatting --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 44645522a..70050358c 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -414,8 +414,8 @@ using ::log; using ::sqrt; using ::floor; using ::ceil; -template <> +template <> struct hash<Eigen::half> { size_t operator()(const Eigen::half& a) const { return std::hash<unsigned short>()(a.x); -- cgit v1.2.3 From 0ea7ab4f623864c82163d106cc93c8a97e4baac6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 14:44:55 -0700 Subject: Hashing was only officially introduced in c++11. Therefore only define an implementation of the hash function for float16 if c++11 is enabled. --- Eigen/src/Core/arch/CUDA/Half.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 70050358c..212aa0d5d 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -415,12 +415,14 @@ using ::sqrt; using ::floor; using ::ceil; +#if __cplusplus > 199711L template <> struct hash<Eigen::half> { size_t operator()(const Eigen::half& a) const { return std::hash<unsigned short>()(a.x); } }; +#endif } // end namespace std -- cgit v1.2.3 From 3da495e6b9a9e8def7914b53a8698a09b1998037 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 18:11:51 -0700 Subject: Relaxed the condition used to gate the fft code. --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 1918392d1..d6db45ade 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -12,7 +12,7 @@ // This code requires the ability to initialize arrays of constant // values directly inside a class.
-#ifdef EIGEN_HAS_CONSTEXPR +#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 namespace Eigen { @@ -565,7 +565,7 @@ struct TensorEvaluator, D // This will support a maximum FFT size of 2^32 for each dimension // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; - RealScalar m_sin_PI_div_n_LUT[32] = { + const RealScalar m_sin_PI_div_n_LUT[32] = { RealScalar(0.0), RealScalar(-2), RealScalar(-0.999999999999999), @@ -601,7 +601,7 @@ struct TensorEvaluator, D }; // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); - RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { + const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { RealScalar(0.0), RealScalar(0.0), RealScalar(-1.00000000000000e+00), -- cgit v1.2.3 From dd5d390daf3a3a561a772b64f1b602e5f240bf8b Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Fri, 1 Apr 2016 13:32:29 +0100 Subject: Added zeta function. --- Eigen/src/Core/GenericPacketMath.h | 5 + Eigen/src/Core/GlobalFunctions.h | 1 + Eigen/src/Core/SpecialFunctions.h | 189 ++++++++++++++++++++++++ Eigen/src/Core/arch/CUDA/MathFunctions.h | 14 ++ Eigen/src/Core/arch/CUDA/PacketMath.h | 1 + Eigen/src/Core/functors/UnaryFunctors.h | 22 +++ Eigen/src/plugins/ArrayCwiseUnaryOps.h | 9 ++ test/array.cpp | 9 ++ unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 + 9 files changed, 256 insertions(+) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 802def51d..988fc9c99 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -76,6 +76,7 @@ struct default_packet_traits HasTanh = 0, HasLGamma = 0, HasDiGamma = 0, + HasZeta = 0, HasErf = 0, HasErfc = 0, HasIGamma = 0, @@ -450,6 +451,10 @@ Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); } /** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); } + +/** \internal \returns the zeta function of two arguments (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); } /** \internal \returns the erf(\a a) (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 7df0fdda9..a013cca1f 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -51,6 +51,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(zeta,scalar_zeta_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 37ebb5915..4240ebf2f 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -722,6 +722,189 @@ struct igamma_impl { #endif // EIGEN_HAS_C99_MATH +/**************************************************************************** + * Implementation of Riemann zeta function of two arguments * + ****************************************************************************/ + +template +struct zeta_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + 
+template <typename Scalar> +struct zeta_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct zeta_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x, Scalar q) { + /* zeta.c + * + * Riemann zeta function of two arguments + * + * + * + * SYNOPSIS: + * + * double x, q, y, zeta(); + * + * y = zeta( x, q ); + * + * + * + * DESCRIPTION: + * + * + * + * zeta(x,q) = sum for k = 0..inf of (k+q)^(-x) + * + * where x > 1 and q is not a negative integer or zero. + * The Euler-Maclaurin summation formula is used to obtain + * the expansion + * + * zeta(x,q) = sum for k = 1..n of (k+q)^(-x) + * + (n+q)^(1-x)/(x-1) - 1/(2(n+q)^x) + * + sum for j = 1..inf of B_2j * x(x+1)...(x+2j) / ((2j)! (n+q)^(x+2j+1)) + * + * where the B_2j are Bernoulli numbers. Note that (see zetac.c) + * zeta(x,1) = zetac(x) + 1. + * + * + * + * ACCURACY: + * + * + * + * REFERENCE: + * + * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals, + * Series, and Products, p. 1073; Academic Press, 1980. + * + */ + + int i; + /*double a, b, k, s, t, w;*/ + Scalar p, r, a, b, k, s, t, w; + + const double A[] = { + 12.0, + -720.0, + 30240.0, + -1209600.0, + 47900160.0, + -1.8924375803183791606e9, /*1.307674368e12/691*/ + 7.47242496e10, + -2.950130727918164224e12, /*1.067062284288e16/3617*/ + 1.1646782814350067249e14, /*5.109094217170944e18/43867*/ + -4.5979787224074726105e15, /*8.028576626982912e20/174611*/ + 1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/ + -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/ + }; + + const Scalar maxnum = NumTraits::infinity(); + const Scalar zero = 0.0, half = 0.5, one = 1.0; + const Scalar machep = igamma_helper::machep(); + + if( x == one ) + return maxnum; //goto retinf; + + if( x < one ) + { + // domerr: + // mtherr( "zeta", DOMAIN ); + return zero; + } + + if( q <= zero ) + { + if(q == numext::floor(q)) + { + // mtherr( "zeta", SING ); + // retinf: + return maxnum; + } + p = x; + r = numext::floor(p); + // if( x != floor(x) ) + // goto domerr; /* because q^-x not defined */ + if (p != r) + return zero; + } + + /* Euler-Maclaurin summation formula */ + /* + if( x < 25.0 ) + */ + { + /* Permit negative q but continue sum until n+q > +9 . + * This case should be handled by a reflection formula. + * If q<0 and x is an integer, there is a relation to + * the polygamma function.
+ */ + s = numext::pow( q, -x ); + a = q; + i = 0; + b = zero; + while( (i < 9) || (a <= Scalar(9.0)) ) + { + i += 1; + a += one; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return s; // goto done; + } + + w = a; + s += b*w/(x-one); + s -= half * b; + a = one; + k = zero; + for( i=0; i<12; i++ ) + { + a *= x + k; + b /= w; + t = a*b/A[i]; + s = s + t; + t = numext::abs(t/s); + if( t < machep ) + return s; // goto done; + k += one; + a *= x + k; + b /= w; + k += one; + } + // done: + return(s); + } + } +}; + +#endif // EIGEN_HAS_C99_MATH + } // end namespace internal namespace numext { @@ -737,6 +920,12 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar) digamma(const Scalar& x) { return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x); } + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar) +zeta(const Scalar& x, const Scalar& q) { + return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q); +} template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 6822700f8..858775523 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -91,6 +91,20 @@ double2 pdigamma(const double2& a) using numext::digamma; return make_double2(digamma(a.x), digamma(a.y)); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pzeta(const float4& a) +{ + using numext::zeta; + return make_float4(zeta(a.x), zeta(a.y), zeta(a.z), zeta(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pzeta(const double2& a) +{ + using numext::zeta; + return make_double2(zeta(a.x), zeta(a.y)); +} template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 perf(const float4& a) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 56822838e..e0db18fbf 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -40,6 +40,7 @@ template<> struct packet_traits : default_packet_traits HasRsqrt = 1, HasLGamma = 1, HasDiGamma = 1, + HasZeta = 1, HasErf = 1, HasErfc = 1, HasIgamma = 1, diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 531beead6..e2fb8d8d6 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -448,6 +448,28 @@ struct functor_traits > PacketAccess = packet_traits::HasDiGamma }; }; + +/** \internal + * \brief Template functor to compute the Riemann Zeta function of two arguments. 
+ * \sa class CwiseUnaryOp, Cwise::zeta() + */ +template struct scalar_zeta_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const { + using numext::zeta; return zeta(x, q); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasZeta + }; +}; /** \internal * \brief Template functor to compute the Gauss error function of a diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 2ce7414a1..6c92a2f1b 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -23,6 +23,7 @@ typedef CwiseUnaryOp, const Derived> SinhReturn typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> LgammaReturnType; typedef CwiseUnaryOp, const Derived> DigammaReturnType; +typedef CwiseUnaryOp, const Derived> ZetaReturnType; typedef CwiseUnaryOp, const Derived> ErfReturnType; typedef CwiseUnaryOp, const Derived> ErfcReturnType; typedef CwiseUnaryOp, const Derived> PowReturnType; @@ -329,6 +330,14 @@ digamma() const return DigammaReturnType(derived()); } +/** \returns an expression of the coefficient-wise zeta function. + */ +inline const ZetaReturnType +zeta() const +{ + return ZetaReturnType(derived()); +} + /** \returns an expression of the coefficient-wise Gauss error * function of *this. * diff --git a/test/array.cpp b/test/array.cpp index d05744c4a..2f0b0f1b6 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -322,6 +322,15 @@ template void array_real(const ArrayType& m) std::numeric_limits::infinity()); VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); + + // Check the zeta function against scipy.special.zeta + VERIFY_IS_APPROX(numext::zeta(Scalar(1.5), Scalar(2)), RealScalar(1.61237534869)); + VERIFY_IS_APPROX(numext::zeta(Scalar(4), Scalar(1.5)), RealScalar(0.234848505667)); + VERIFY_IS_APPROX(numext::zeta(Scalar(10.5), Scalar(3)), RealScalar(1.03086757337e-5)); + VERIFY_IS_APPROX(numext::zeta(Scalar(10000.5), Scalar(1.0001)), RealScalar(0.367879440865)); + VERIFY_IS_APPROX(numext::zeta(Scalar(3), Scalar(-2.5)), RealScalar(0.054102025820864097)); + VERIFY_IS_EQUAL(numext::zeta(Scalar(1), Scalar(1.2345)), // The second scalar does not matter + std::numeric_limits::infinity()); { // Test various propreties of igamma & igammac. These are normalized diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 6ee9c88b9..7c427d3c1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -132,6 +132,12 @@ class TensorBase digamma() const { return unaryExpr(internal::scalar_digamma_op()); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + zeta() const { + return unaryExpr(internal::scalar_zeta_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> -- cgit v1.2.3 From 57239f4a8149dbd603ad376e90a0a4574b846710 Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Fri, 1 Apr 2016 14:35:21 +0100 Subject: Added polygamma function. 
--- Eigen/src/Core/GenericPacketMath.h | 5 +++ Eigen/src/Core/GlobalFunctions.h | 1 + Eigen/src/Core/SpecialFunctions.h | 52 ++++++++++++++++++++++++- Eigen/src/Core/arch/CUDA/MathFunctions.h | 22 +++++++++-- Eigen/src/Core/arch/CUDA/PacketMath.h | 1 + Eigen/src/Core/functors/UnaryFunctors.h | 22 +++++++++++ Eigen/src/plugins/ArrayCwiseUnaryOps.h | 9 +++++ test/array.cpp | 5 +++ unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 +++ 9 files changed, 118 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 988fc9c99..6ff61c18a 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -77,6 +77,7 @@ struct default_packet_traits HasLGamma = 0, HasDiGamma = 0, HasZeta = 0, + HasPolygamma = 0, HasErf = 0, HasErfc = 0, HasIGamma = 0, @@ -456,6 +457,10 @@ Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); } +/** \internal \returns the polygamma function (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); } + /** \internal \returns the erf(\a a) (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perf(const Packet& a) { using numext::erf; return erf(a); } diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index a013cca1f..05ba6ddb4 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -52,6 +52,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(zeta,scalar_zeta_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(polygamma,scalar_polygamma_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 4240ebf2f..02ac7cf3f 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -736,7 +736,7 @@ struct zeta_retval { template struct zeta_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar x) { + static Scalar run(Scalar x, Scalar q) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); @@ -905,6 +905,50 @@ struct zeta_impl { #endif // EIGEN_HAS_C99_MATH +/**************************************************************************** + * Implementation of polygamma function * + ****************************************************************************/ + +template +struct polygamma_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar n, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar n, Scalar x) { + Scalar zero = 0.0, one = 1.0; + Scalar nplus = n + one; + + // Just return the digamma function for n = 1 + if (n == zero) { + return digamma_impl::run(x); + } + // Use the same implementation as scipy + else { + Scalar factorial = 
numext::exp(lgamma_impl::run(nplus)); + return numext::pow(-one, nplus) * factorial * zeta_impl::run(nplus, x); + } + } +}; + +#endif // EIGEN_HAS_C99_MATH + } // end namespace internal namespace numext { @@ -927,6 +971,12 @@ zeta(const Scalar& x, const Scalar& q) { return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q); } +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar) +polygamma(const Scalar& n, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x); +} + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x) { diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 858775523..317499b29 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -93,17 +93,31 @@ double2 pdigamma(const double2& a) } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float4 pzeta(const float4& a) +float4 pzeta(const float4& x, const float4& q) { using numext::zeta; - return make_float4(zeta(a.x), zeta(a.y), zeta(a.z), zeta(a.w)); + return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w)); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double2 pzeta(const double2& a) +double2 pzeta(const double2& x, const double2& q) { using numext::zeta; - return make_double2(zeta(a.x), zeta(a.y)); + return make_double2(zeta(x.x, q.x), zeta(x.y, q.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 ppolygamma(const float4& n, const float4& x) +{ + using numext::polygamma; + return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 ppolygamma(const double2& n, const double2& x) +{ + using numext::polygamma; + return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y)); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index e0db18fbf..932df1092 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -41,6 +41,7 @@ template<> struct packet_traits : default_packet_traits HasLGamma = 1, HasDiGamma = 1, HasZeta = 1, + HasPolygamma = 1, HasErf = 1, HasErfc = 1, HasIgamma = 1, diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index e2fb8d8d6..826d84f69 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -471,6 +471,28 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the polygamma function. 
+ * \sa class CwiseUnaryOp, Cwise::polygamma() + */ +template struct scalar_polygamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const { + using numext::polygamma; return polygamma(n, x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasPolygamma + }; +}; + /** \internal * \brief Template functor to compute the Gauss error function of a * scalar diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 6c92a2f1b..56c71172c 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -24,6 +24,7 @@ typedef CwiseUnaryOp, const Derived> CoshReturn typedef CwiseUnaryOp, const Derived> LgammaReturnType; typedef CwiseUnaryOp, const Derived> DigammaReturnType; typedef CwiseUnaryOp, const Derived> ZetaReturnType; +typedef CwiseUnaryOp, const Derived> PolygammaReturnType; typedef CwiseUnaryOp, const Derived> ErfReturnType; typedef CwiseUnaryOp, const Derived> ErfcReturnType; typedef CwiseUnaryOp, const Derived> PowReturnType; @@ -338,6 +339,14 @@ zeta() const return ZetaReturnType(derived()); } +/** \returns an expression of the coefficient-wise polygamma function. + */ +inline const PolygammaReturnType +polygamma() const +{ + return PolygammaReturnType(derived()); +} + /** \returns an expression of the coefficient-wise Gauss error * function of *this. * diff --git a/test/array.cpp b/test/array.cpp index 2f0b0f1b6..56d196923 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -331,6 +331,11 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(numext::zeta(Scalar(3), Scalar(-2.5)), RealScalar(0.054102025820864097)); VERIFY_IS_EQUAL(numext::zeta(Scalar(1), Scalar(1.2345)), // The second scalar does not matter std::numeric_limits::infinity()); + + // Check the polygamma against scipy.special.polygamma + VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(2)), RealScalar(0.644934066848)); + VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(3)), RealScalar(0.394934066848)); + VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(25.5)), RealScalar(0.0399946696496)); { // Test various propreties of igamma & igammac. These are normalized diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 7c427d3c1..65b969aaf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -138,6 +138,12 @@ class TensorBase zeta() const { return unaryExpr(internal::scalar_zeta_op()); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + polygamma() const { + return unaryExpr(internal::scalar_polygamma_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> -- cgit v1.2.3 From 3cb0a237c195dd470ff5cd7a6e793b74d2d6815d Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Fri, 1 Apr 2016 17:51:39 +0100 Subject: Fixed suggestions by Eugene Brevdo. 
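The cleanup below drops the cephes-style goto targets (domerr, retinf) that had survived as commented-out code, removes the always-taken Euler-Maclaurin block, and makes the coefficient table Scalar-typed so the float path is not silently promoted to double. The control-flow change, reduced to a hypothetical before/after pair (not code from the patch):

#include <cmath>
#include <cstdio>
#include <limits>

// Before: cephes-style error handling through a goto label.
static double f_before(double x) {
  if (x < 0.0) goto domerr;
  return std::sqrt(x);
domerr:
  return std::numeric_limits<double>::quiet_NaN();
}

// After: the label collapses into an early return, which reads better and
// avoids goto in device-compatible code paths.
static double f_after(double x) {
  if (x < 0.0) return std::numeric_limits<double>::quiet_NaN();
  return std::sqrt(x);
}

int main() {
  std::printf("%f %f\n", f_before(4.0), f_after(4.0));
  return 0;
}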
--- Eigen/src/Core/SpecialFunctions.h | 92 +++++++++++++++++---------------------- test/array.cpp | 14 +++++- 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 02ac7cf3f..26b92607c 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -806,10 +806,9 @@ struct zeta_impl { */ int i; - /*double a, b, k, s, t, w;*/ Scalar p, r, a, b, k, s, t, w; - const double A[] = { + const Scalar A[] = { 12.0, -720.0, 30240.0, @@ -829,12 +828,10 @@ struct zeta_impl { const Scalar machep = igamma_helper::machep(); if( x == one ) - return maxnum; //goto retinf; + return maxnum; if( x < one ) { - // domerr: - // mtherr( "zeta", DOMAIN ); return zero; } @@ -842,64 +839,53 @@ struct zeta_impl { { if(q == numext::floor(q)) { - // mtherr( "zeta", SING ); - // retinf: return maxnum; } p = x; r = numext::floor(p); - // if( x != floor(x) ) - // goto domerr; /* because q^-x not defined */ if (p != r) return zero; } - /* Euler-Maclaurin summation formula */ - /* - if( x < 25.0 ) + /* Permit negative q but continue sum until n+q > +9 . + * This case should be handled by a reflection formula. + * If q<0 and x is an integer, there is a relation to + * the polygamma function. */ + s = numext::pow( q, -x ); + a = q; + i = 0; + b = zero; + while( (i < 9) || (a <= Scalar(9)) ) { - /* Permit negative q but continue sum until n+q > +9 . - * This case should be handled by a reflection formula. - * If q<0 and x is an integer, there is a relation to - * the polygamma function. - */ - s = numext::pow( q, -x ); - a = q; - i = 0; - b = zero; - while( (i < 9) || (a <= Scalar(9.0)) ) - { - i += 1; - a += one; - b = numext::pow( a, -x ); - s += b; - if( numext::abs(b/s) < machep ) - return s; // goto done; - } - - w = a; - s += b*w/(x-one); - s -= half * b; - a = one; - k = zero; - for( i=0; i<12; i++ ) - { - a *= x + k; - b /= w; - t = a*b/A[i]; - s = s + t; - t = numext::abs(t/s); - if( t < machep ) - return s; // goto done; - k += one; - a *= x + k; - b /= w; - k += one; - } - // done: - return(s); - } + i += 1; + a += one; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return s; + } + + w = a; + s += b*w/(x-one); + s -= half * b; + a = one; + k = zero; + for( i=0; i<12; i++ ) + { + a *= x + k; + b /= w; + t = a*b/A[i]; + s = s + t; + t = numext::abs(t/s); + if( t < machep ) + return s; + k += one; + a *= x + k; + b /= w; + k += one; + } + return s; } }; diff --git a/test/array.cpp b/test/array.cpp index 56d196923..8b0a34722 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -332,10 +332,22 @@ template void array_real(const ArrayType& m) VERIFY_IS_EQUAL(numext::zeta(Scalar(1), Scalar(1.2345)), // The second scalar does not matter std::numeric_limits::infinity()); - // Check the polygamma against scipy.special.polygamma + // Check the polygamma against scipy.special.polygamma examples VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(2)), RealScalar(0.644934066848)); VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(3)), RealScalar(0.394934066848)); VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(25.5)), RealScalar(0.0399946696496)); + + // Check the polygamma function over a larger range of values + VERIFY_IS_APPROX(numext::polygamma(Scalar(17), Scalar(4.7)), RealScalar(293.334565435)); + VERIFY_IS_APPROX(numext::polygamma(Scalar(31), Scalar(11.8)), RealScalar(0.445487887616)); + VERIFY_IS_APPROX(numext::polygamma(Scalar(28), Scalar(17.7)), 
RealScalar(-2.47810300902e-07)); + VERIFY_IS_APPROX(numext::polygamma(Scalar(8), Scalar(30.2)), RealScalar(-8.29668781082e-09)); + /* The following tests only pass for doubles because floats cannot handle the large values of + the gamma function. + VERIFY_IS_APPROX(numext::polygamma(Scalar(42), Scalar(15.8)), RealScalar(-0.434562276666)); + VERIFY_IS_APPROX(numext::polygamma(Scalar(147), Scalar(54.1)), RealScalar(0.567742190178)); + VERIFY_IS_APPROX(numext::polygamma(Scalar(170), Scalar(64)), RealScalar(-0.0108615497927)); + */ { // Test various propreties of igamma & igammac. These are normalized -- cgit v1.2.3 From ffd770ce94b75202187635bf0e1e4d0006f4a015 Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Fri, 1 Apr 2016 17:58:24 +0100 Subject: Fixed CUDA signature. --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 26 +++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 65b969aaf..77b509f61 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -132,18 +132,6 @@ class TensorBase digamma() const { return unaryExpr(internal::scalar_digamma_op()); } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - zeta() const { - return unaryExpr(internal::scalar_zeta_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - polygamma() const { - return unaryExpr(internal::scalar_polygamma_op()); - } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> @@ -365,6 +353,20 @@ class TensorBase igammac(const OtherDerived& other) const { return binaryExpr(other.derived(), internal::scalar_igammac_op()); } + + // zeta(x = this, q = other) + template <typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const OtherDerived> + zeta(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_zeta_op<Scalar>()); + } + + // polygamma(n = this, x = other) + template <typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const Derived, const OtherDerived> + polygamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_polygamma_op<Scalar>()); + } // comparisons and tests for Scalars EIGEN_DEVICE_FUNC -- cgit v1.2.3 From eb0ae602bd97866b13524bf3c2593f1dd0b261bf Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Fri, 1 Apr 2016 18:17:45 +0100 Subject: Added CUDA tests.
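For reference, the host-side analogue of what the new device tests check can be run without a GPU. A minimal sketch (assuming the unsupported Tensor module and the binary zeta()/polygamma() methods introduced in the commits above):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<double, 1> x(2), q(2);
  x.setValues({1.5, 4.0});  // first argument of zeta(x, q)
  q.setValues({2.0, 1.5});  // second argument
  Eigen::Tensor<double, 1> z = x.zeta(q);
  // Reference values from scipy.special.zeta: about 1.6123753487 and 0.2348485057.
  std::cout << z(0) << " " << z(1) << std::endl;
  return 0;
}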
--- unsupported/test/cxx11_tensor_cuda.cu | 121 ++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 4d8465756..fc56ae71d 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -626,6 +626,127 @@ void test_cuda_digamma() } } +template <typename Scalar> +void test_cuda_zeta() +{ + Tensor<Scalar, 1> in_x(6); + Tensor<Scalar, 1> in_q(6); + Tensor<Scalar, 1> out(6); + Tensor<Scalar, 1> expected_out(6); + out.setZero(); + + in_x(0) = Scalar(1); + in_x(1) = Scalar(1.5); + in_x(2) = Scalar(4); + in_x(3) = Scalar(-10.5); + in_x(4) = Scalar(10000.5); + in_x(5) = Scalar(3); + + in_q(0) = Scalar(1.2345); + in_q(1) = Scalar(2); + in_q(2) = Scalar(1.5); + in_q(3) = Scalar(3); + in_q(4) = Scalar(1.0001); + in_q(5) = Scalar(-2.5); + + expected_out(0) = std::numeric_limits<Scalar>::infinity(); + expected_out(1) = Scalar(1.61237534869); + expected_out(2) = Scalar(0.234848505667); + expected_out(3) = Scalar(1.03086757337e-5); + expected_out(4) = Scalar(0.367879440865); + expected_out(5) = Scalar(0.054102025820864097); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar *d_in_x, *d_in_q; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_q), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6); + + gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + VERIFY_IS_EQUAL(out(0), expected_out(0)); + + for (int i = 1; i < 6; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } +} + +template <typename Scalar> +void test_cuda_polygamma() +{ + Tensor<Scalar, 1> in_x(7); + Tensor<Scalar, 1> in_n(7); + Tensor<Scalar, 1> out(7); + Tensor<Scalar, 1> expected_out(7); + out.setZero(); + + in_n(0) = Scalar(1); + in_n(1) = Scalar(1); + in_n(2) = Scalar(1); + in_n(3) = Scalar(17); + in_n(4) = Scalar(31); + in_n(5) = Scalar(28); + in_n(6) = Scalar(8); + + in_x(0) = Scalar(2); + in_x(1) = Scalar(3); + in_x(2) = Scalar(25.5); + in_x(3) = Scalar(4.7); + in_x(4) = Scalar(11.8); + in_x(5) = Scalar(17.7); + in_x(6) = Scalar(30.2); + + expected_out(0) = Scalar(0.644934066848); + expected_out(1) = Scalar(0.394934066848); + expected_out(2) = Scalar(0.0399946696496); + expected_out(3) = Scalar(293.334565435); + expected_out(4) = Scalar(0.445487887616); + expected_out(5) = Scalar(-2.47810300902e-07); + expected_out(6) = Scalar(-8.29668781082e-09); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar *d_in_x, *d_in_n; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_n), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) ==
cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 7; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } +} + template <typename Scalar> void test_cuda_igamma() { -- cgit v1.2.3 From 2b457f8e5ec2bd38ce5049bd9220d3f37c7fa1ff Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 4 Apr 2016 11:47:46 +0200 Subject: Fix cross-compiling windows version detection --- cmake/EigenDetermineOSVersion.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/EigenDetermineOSVersion.cmake b/cmake/EigenDetermineOSVersion.cmake index 3c48d4c37..9246fa67c 100644 --- a/cmake/EigenDetermineOSVersion.cmake +++ b/cmake/EigenDetermineOSVersion.cmake @@ -26,7 +26,7 @@ function(DetermineShortWindowsName WIN_VERSION win_num_version) endfunction() function(DetermineOSVersion OS_VERSION) - if (WIN32) + if (WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows) file (TO_NATIVE_PATH "$ENV{COMSPEC}" SHELL) exec_program( ${SHELL} ARGS "/c" "ver" OUTPUT_VARIABLE ver_output) -- cgit v1.2.3 From 1108b4f21836d52b50e4ec10a6e0eec027eda04d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 Apr 2016 11:09:25 -0700 Subject: Fixed the signature of numext::abs to make it compatible with complex numbers --- Eigen/src/Core/MathFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 000cafee7..e6c7dfa08 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1034,7 +1034,7 @@ double tan(const double &x) { return ::tan(x); } template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T abs(const T &x) { +typename NumTraits<T>::Real abs(const T &x) { EIGEN_USING_STD_MATH(abs); return abs(x); } -- cgit v1.2.3 From c4179dd470f72520b9ffba5b78d4dd1261ccc609 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 Apr 2016 11:11:51 -0700 Subject: Updated the scalar_abs_op struct to make it compatible with cuda devices. --- Eigen/src/Core/functors/UnaryFunctors.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 531beead6..46622f804 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -41,7 +41,7 @@ struct functor_traits<scalar_abs_op<Scalar> > template<typename Scalar> struct scalar_abs_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op) typedef typename NumTraits<Scalar>::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs(a); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pabs(a); } -- cgit v1.2.3 From b97911dd189c0377df6ba4ef1a710105b9437a3c Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Mon, 4 Apr 2016 19:16:03 +0100 Subject: Refactored code into type-specific helper functions.
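The refactoring below moves the precision-dependent part of the series loop into a zeta_impl_series helper with float and double specializations, so each type carries its own termination rule while zeta_impl keeps the shared logic. The dispatch shape, reduced to a toy with illustrative names (not the Eigen internals):

#include <cstdio>

template <typename Scalar>
struct series_helper;  // primary template left undefined: unsupported types fail to compile

// Hypothetical per-type termination rules; float stops earlier than double.
template <> struct series_helper<float> {
  static bool done(float a) { return a > 9.0f; }
};
template <> struct series_helper<double> {
  static bool done(double a) { return a > 19.0; }
};

template <typename Scalar>
int count_terms(Scalar start) {
  int n = 0;
  for (Scalar a = start; !series_helper<Scalar>::done(a); a += Scalar(1)) ++n;
  return n;  // number of terms accumulated before the type-specific cutoff
}

int main() {
  std::printf("float: %d terms, double: %d terms\n", count_terms(0.0f), count_terms(0.0));
  return 0;
}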
--- Eigen/src/Core/SpecialFunctions.h | 86 +++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 26b92607c..772449bc7 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -744,6 +744,56 @@ struct zeta_impl { }; #else + +template <typename Scalar> +struct zeta_impl_series { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <> +struct zeta_impl_series<float> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static bool run(float& a, float& b, float& s, const float x, const float machep) { + int i = 0; + while(i < 9) + { + i += 1; + a += 1.0f; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <> +struct zeta_impl_series<double> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static bool run(double& a, double& b, double& s, const double x, const double machep) { + int i = 0; + while( (i < 9) || (a <= 9.0) ) + { + i += 1; + a += 1.0; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; template <typename Scalar> struct zeta_impl { @@ -809,18 +859,18 @@ struct zeta_impl { Scalar p, r, a, b, k, s, t, w; const Scalar A[] = { - 12.0, - -720.0, - 30240.0, - -1209600.0, - 47900160.0, - -1.8924375803183791606e9, /*1.307674368e12/691*/ - 7.47242496e10, - -2.950130727918164224e12, /*1.067062284288e16/3617*/ - 1.1646782814350067249e14, /*5.109094217170944e18/43867*/ - -4.5979787224074726105e15, /*8.028576626982912e20/174611*/ - 1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/ - -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/ + Scalar(12.0), + Scalar(-720.0), + Scalar(30240.0), + Scalar(-1209600.0), + Scalar(47900160.0), + Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/ + Scalar(7.47242496e10), + Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/ + Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/ + Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/ + Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/ + Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/ }; const Scalar maxnum = NumTraits::infinity(); @@ -854,16 +904,10 @@ struct zeta_impl { */ s = numext::pow( q, -x ); a = q; - i = 0; b = zero; - while( (i < 9) || (a <= Scalar(9)) ) - { - i += 1; - a += one; - b = numext::pow( a, -x ); - s += b; - if( numext::abs(b/s) < machep ) - return s; + // Run the summation in a helper function that is specific to the floating precision + if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) { + return s; } w = a; -- cgit v1.2.3 From 03f2997a119578b894ee717aff23e2641ab78f37 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 4 Apr 2016 16:41:47 -0400 Subject: bug #1191 - Prevent Clang/ARM from rewriting VMLA into VMUL+VADD --- Eigen/src/Core/arch/NEON/PacketMath.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index fead02916..10ef1d2b3 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -186,7 +186,25 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co //
MLA: 10 GFlop/s ; FMA: 12 GFlops/s. template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } #else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { +#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM + // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, + // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on + // -march=armv7-a, that is a very common case. + // See e.g. this thread: + // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html + Packet4f r = c; + asm volatile( + "vmla.f32 %q[r], %q[a], %q[b]" + : [r] "+w" (r) + : [a] "w" (a), + [b] "w" (b) + : ); + return r; +#else + return vmlaq_f32(c,a,b); +#endif +} #endif // No FMA instruction for int, so use MLA unconditionally. -- cgit v1.2.3 From 158fea0f5e15e4611c36ce73f582c484deeace1a Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 4 Apr 2016 16:42:40 -0400 Subject: bug #1190 - Don't trust __ARM_FEATURE_FMA on Clang/ARM --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 10ef1d2b3..63a2d9f52 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -177,7 +177,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co return pset1(0); } -#ifdef __ARM_FEATURE_FMA +// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, +// then implements a slow software scalar fallback calling fmaf()! +#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) // See bug 936. // FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. // FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. 
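Taken together, these two NEON changes stop trusting the toolchain: bug #1191 pins the VMLA instruction with inline assembly so Clang cannot split it into VMUL+VADD, and bug #1190 cross-checks __ARM_FEATURE_FMA against the compiler before believing it. A reduced sketch of that double-gating idiom (the MY_USE_HW_FMA macro name is illustrative, not from the patch):

#include <cstdio>

// Only trust the feature macro when the compiler is known to honor it.
#if defined(__ARM_FEATURE_FMA) && !(defined(__clang__) && defined(__arm__))
#define MY_USE_HW_FMA 1
#else
#define MY_USE_HW_FMA 0
#endif

int main() {
  std::printf("hardware FMA path enabled: %d\n", MY_USE_HW_FMA);
  return 0;
}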
-- cgit v1.2.3 From 988344daf12681cbb50373c7a04cd92cfc8e18d7 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 5 Apr 2016 05:59:30 -0400 Subject: enable the other includes as well --- Eigen/Core | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index cc4ac5843..b7d254255 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -337,8 +337,8 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/Complex.h" #elif defined EIGEN_VECTORIZE_ZVECTOR #include "src/Core/arch/ZVector/PacketMath.h" -// #include "src/Core/arch/ZVector/MathFunctions.h" -// #include "src/Core/arch/ZVector/Complex.h" + #include "src/Core/arch/ZVector/MathFunctions.h" + #include "src/Core/arch/ZVector/Complex.h" #endif #include "src/Core/arch/CUDA/Half.h" -- cgit v1.2.3 From 644d0f91d2c1ecb3ed8c64c241f1ce6429ed1ca0 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 5 Apr 2016 05:59:54 -0400 Subject: enable all tests again --- test/packetmath.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 7309a85f8..37da6c86f 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -471,7 +471,7 @@ template void packetmath_notcomplex() internal::pstore(data2, internal::plset(data1[0])); VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset"); } -/* + template void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) { typedef internal::packet_traits PacketTraits; @@ -530,7 +530,7 @@ template void packetmath_complex() internal::pstore(pval,internal::pcplxflip(internal::pload(data1))); VERIFY(areApprox(ref, pval, PacketSize) && "pcplxflip"); } -}*/ +} template void packetmath_scatter_gather() { @@ -575,23 +575,23 @@ void test_packetmath() CALL_SUBTEST_1( packetmath() ); CALL_SUBTEST_2( packetmath() ); CALL_SUBTEST_3( packetmath() ); -/* CALL_SUBTEST_4( packetmath >() ); - CALL_SUBTEST_5( packetmath >() );*/ + CALL_SUBTEST_4( packetmath >() ); + CALL_SUBTEST_5( packetmath >() ); CALL_SUBTEST_1( packetmath_notcomplex() ); CALL_SUBTEST_2( packetmath_notcomplex() ); CALL_SUBTEST_3( packetmath_notcomplex() ); -/* CALL_SUBTEST_1( packetmath_real() ); - CALL_SUBTEST_2( packetmath_real() );*/ + CALL_SUBTEST_1( packetmath_real() ); + CALL_SUBTEST_2( packetmath_real() ); -/* CALL_SUBTEST_4( packetmath_complex >() ); - CALL_SUBTEST_5( packetmath_complex >() );*/ + CALL_SUBTEST_4( packetmath_complex >() ); + CALL_SUBTEST_5( packetmath_complex >() ); CALL_SUBTEST_1( packetmath_scatter_gather() ); CALL_SUBTEST_2( packetmath_scatter_gather() ); CALL_SUBTEST_3( packetmath_scatter_gather() ); -/* CALL_SUBTEST_4( packetmath_scatter_gather >() ); - CALL_SUBTEST_5( packetmath_scatter_gather >() );*/ + CALL_SUBTEST_4( packetmath_scatter_gather >() ); + CALL_SUBTEST_5( packetmath_scatter_gather >() ); } } -- cgit v1.2.3 From 2d41dc9622f9a15bcf77736ef45dc3f7e3d34bdc Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 5 Apr 2016 06:00:51 -0400 Subject: complete int/double specialized traits for ZVector --- Eigen/src/Core/arch/ZVector/PacketMath.h | 512 ++++++------------------------- 1 file changed, 93 insertions(+), 419 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index c786aeec0..3586f87af 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PACKET_MATH_ZVECTOR_H #define EIGEN_PACKET_MATH_ZVECTOR_H +#include + namespace 
Eigen { namespace internal { @@ -31,7 +33,6 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -typedef __vector float Packet4f; typedef __vector int Packet4i; typedef __vector unsigned int Packet4ui; typedef __vector __bool int Packet4bi; @@ -42,20 +43,21 @@ typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; typedef union { - float f[4]; - double d[2]; - int i[4]; - Packet4f v4f; - Packet4i v4i; - Packet2d v2d; + int32_t i[4]; + uint32_t ui[4]; + int64_t l[2]; + uint64_t ul[2]; + double d[2]; + Packet4i v4i; + Packet4ui v4ui; + Packet2l v2l; + Packet2ul v2ul; + Packet2d v2d; } Packet; // We don't want to write the same code all the time, but we need to reuse the constants // and it doesn't really work to declare them global, so we define macros instead -#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) - #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = reinterpret_cast(vec_splat_s32(X)) @@ -65,9 +67,6 @@ typedef union { #define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \ Packet2l p2l_##NAME = reinterpret_cast(vec_splat_s64(X)) -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = pset1(X) - #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = pset1(X) @@ -77,18 +76,14 @@ typedef union { #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ Packet2l p2l_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) - // These constants are endian-agnostic -static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); -static Packet4f p4f_ONE = { 1.0, 1.0, 1.0, 1.0 }; static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; @@ -98,7 +93,6 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} */ -static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -112,23 +106,23 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; // Handle endianness properly while loading constants // Define global static constants: -/* -static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);*/ + +static Packet16uc p16uc_FORWARD = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }; static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -/* + static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) 
vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; -static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; +/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; -static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/ static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; -static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/ static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; -static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; +//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; //static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; @@ -139,29 +133,6 @@ static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32 #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( " pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #endif -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4f type; - typedef Packet4f half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, - - // FIXME check the Has* - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasSin = 0, - HasCos = 0, - HasLog = 1, - HasExp = 1, - HasSqrt = 0 - }; -}; - template<> struct packet_traits : default_packet_traits { typedef Packet4i type; @@ -178,11 +149,7 @@ template<> struct packet_traits : default_packet_traits HasSub = 1, HasMul = 1, HasDiv = 1, - HasSin = 0, - HasCos = 0, - HasLog = 1, - HasExp = 1, - HasSqrt = 0 + HasBlend = 1 }; }; @@ -203,88 +170,60 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasSin = 0, HasCos = 0, - HasLog = 1, + HasLog = 0, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasRsqrt = 1, + HasBlend = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1 }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; -inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) +inline std::ostream & operator <<(std::ostream & s, const 
Packet4i & v) { - union { - Packet16uc v; - unsigned char n[16]; - } vt; - vt.v = v; - for (int i=0; i< 16; i++) - s << (int)vt.n[i] << ", "; + Packet vt; + vt.v4i = v; + s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3]; return s; } -inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) +inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) { - union { - Packet4f v; - float n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + Packet vt; + vt.v4ui = v; + s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3]; return s; } -inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) +inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { - union { - Packet4i v; - int n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + Packet vt; + vt.v2l = v; + s << vt.l[0] << ", " << vt.l[1]; return s; } -inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) +inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v) { - union { - Packet4ui v; - unsigned int n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + Packet vt; + vt.v2ul = v; + s << vt.ul[0] << ", " << vt.ul[1] ; return s; } inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) { - union { - Packet2d v; - double n[2]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1]; + Packet vt; + vt.v2d = v; + s << vt.d[0] << ", " << vt.d[1]; return s; } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - switch (Offset % 4) { - case 1: - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 4)); break; - case 2: - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); break; - case 3: - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 12)); break; - } - } -}; - template struct palign_impl { @@ -311,16 +250,6 @@ struct palign_impl } }; -// Need to define them first or we get specialization after instantiation errors -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet *vfrom; - vfrom = (Packet *) from; - return vfrom->v4f; -} - template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { // FIXME: No intrinsic yet @@ -338,15 +267,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) return vfrom->v2d; } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - Packet *vto; - vto = (Packet *) to; - vto->v4f = from; -} - template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { // FIXME: No intrinsic yet @@ -365,15 +285,6 @@ template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& vto->v2d = from; } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) -{ - // FIXME: Check if proper intrinsic exists - Packet res; - res.f[0] = from; - res.v4f = reinterpret_cast(vec_splats(res.i[0])); - return res.v4f; -} - template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vec_splats(from); @@ -385,17 +296,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return res; } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, 
Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = reinterpret_cast(vec_splat(reinterpret_cast(a3), 0)); - a1 = reinterpret_cast(vec_splat(reinterpret_cast(a3), 1)); - a2 = reinterpret_cast(vec_splat(reinterpret_cast(a3), 2)); - a3 = reinterpret_cast(vec_splat(reinterpret_cast(a3), 3)); -} - template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) @@ -419,16 +319,6 @@ pbroadcast4(const double *a, a3 = vec_splat(a3, 1); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - float EIGEN_ALIGN16 af[4]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - af[2] = from[2*stride]; - af[3] = from[3*stride]; - return pload(af); -} - template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { int EIGEN_ALIGN16 ai[4]; @@ -447,16 +337,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou return pload(af); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - float EIGEN_ALIGN16 af[4]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; - to[2*stride] = af[2]; - to[3*stride] = af[3]; -} - template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { int EIGEN_ALIGN16 ai[4]; @@ -475,159 +355,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, to[1*stride] = af[1]; } -/* -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub(p4f_ZERO, a); } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub(p4i_ZERO, a); } -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub(p2d_ZERO, a); } -*/ -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ - return a; -} - -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) -{ - return reinterpret_cast(__builtin_s390_vmalf(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c))); -} +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) -{ - return vec_madd(a, b, c); -} +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) -{ - return pmadd(a,b,p4f_ZERO); -} - -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) -{ - return pmadd(a,b,p4i_ZERO); -} - -template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) -{ - 
return pmadd(a,b,p2d_ZERO); -} - -/*template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ -#ifndef __VSX__ // VSX actually provides a div instruction - Packet4f t, y_0, y_1; +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } - // Altivec does not offer a divide instruction, we have to do a reciprocal approximation - y_0 = vec_re(b); +template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } - // Do one Newton-Raphson iteration to get the needed accuracy - t = vec_nmsub(y_0, b, p4f_ONE); - y_1 = vec_madd(y_0, t, y_0); +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } - return vec_madd(a, y_1, p4f_ZERO); -#else - return vec_div(a, b); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) -{ eigen_assert(false && "packet integer division are not supported by AltiVec"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } -*/ +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return pmadd(a, p4f_ONE, b); } -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return pmadd(a, p4i_ONE, b); } -template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return pmadd(a, p2d_ONE, b); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } - -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return a; /*vec_min(a, b);*/ } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return a; /*vec_max(a, b);*/ } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - return reinterpret_cast(vec_and(reinterpret_cast(a), reinterpret_cast(b))); -} - -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) -{ - return vec_and(a, b); -} - -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) -{ - 
return vec_and(a, b); -} - -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - return reinterpret_cast(vec_or(reinterpret_cast(a), reinterpret_cast(b))); -} - -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) -{ - return vec_or(a, b); -} - -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) -{ - return vec_or(a, b); -} - -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - return reinterpret_cast(vec_xor(reinterpret_cast(a), reinterpret_cast(b))); -} - -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) -{ - return vec_xor(a, b); -} +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) -{ - return vec_and(a, vec_nor(b, b)); -} +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) -{ - return vec_xor(a, b); -} +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - return pand(a, reinterpret_cast(vec_nor(reinterpret_cast(b), reinterpret_cast(b)))); -} +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) -{ - return pand(a, vec_nor(b, b)); -} - -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - Packet *vfrom; - vfrom = (Packet *) from; - return vfrom->v4f; -} +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { @@ -645,14 +418,6 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) return vfrom->v2d; } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - Packet4f p; - if((ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return (Packet4f) vec_perm((Packet16uc)(p), (Packet16uc)(p), p16uc_DUPLICATE32_HI); -} - template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { Packet4i p; @@ -669,14 +434,6 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) return vec_perm(p, p, p16uc_PSET64_HI); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - Packet *vto; - vto = (Packet *) to; - vto->v4f = from; -} - template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE @@ -693,30 +450,12 @@ template<> 
EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& vto->v2d = from; } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) -{ - EIGEN_ZVECTOR_PREFETCH(addr); -} - -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) -{ - EIGEN_ZVECTOR_PREFETCH(addr); -} +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) -{ - EIGEN_ZVECTOR_PREFETCH(addr); -} - -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); -} - template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); @@ -727,20 +466,9 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); } -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return a; /*vec_abs(a);*/ } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - Packet4f b, sum; - b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); - sum = padd(a, b); - b = reinterpret_cast(vec_sld(reinterpret_cast(sum), reinterpret_cast(sum), 4)); - sum = padd(sum, b); - return pfirst(sum); -} - template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i b, sum; @@ -760,34 +488,6 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) } -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - Packet4f v[4], sum[4]; - - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[2]))); - v[1] = reinterpret_cast(vec_mergeh(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[2]))); - v[2] = reinterpret_cast(vec_mergeh(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[3]))); - v[3] = reinterpret_cast(vec_mergeh(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[3]))); - // Get the resulting vectors - sum[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(v[0]), reinterpret_cast(v[2]))); - sum[1] = reinterpret_cast(vec_mergeh(reinterpret_cast(v[0]), reinterpret_cast(v[2]))); - sum[2] = reinterpret_cast(vec_mergeh(reinterpret_cast(v[1]), reinterpret_cast(v[3]))); - sum[3] = reinterpret_cast(vec_mergeh(reinterpret_cast(v[1]), reinterpret_cast(v[3]))); - - // Now do the summation: - // Lines 0+1 - sum[0] = padd(sum[0], sum[1]); - // Lines 2+3 - sum[1] = padd(sum[2], sum[3]); - // Add the results - sum[0] = padd(sum[0], sum[1]); - - return sum[0]; -} - template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { 
Packet4i v[4], sum[4]; @@ -829,13 +529,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) // Other reduction functions: // mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - Packet4f prod; - prod = pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8))); - return pfirst(pmul(prod, reinterpret_cast(vec_sld(reinterpret_cast(prod), reinterpret_cast(prod), 4)))); -} - template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { EIGEN_ALIGN16 int aux[4]; @@ -849,14 +542,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) } // min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet4f b, res; - b = pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8))); - res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 4))); - return pfirst(res); -} - template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { Packet4i b, res; @@ -871,14 +556,6 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) } // max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet4f b, res; - b = pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8))); - res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 4))); - return pfirst(res); -} - template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { Packet4i b, res; @@ -893,26 +570,12 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4f t0, t1, t2, t3; - t0 = reinterpret_cast(vec_mergeh(reinterpret_cast(kernel.packet[0]), reinterpret_cast(kernel.packet[2]))); - t1 = reinterpret_cast(vec_mergel(reinterpret_cast(kernel.packet[0]), reinterpret_cast(kernel.packet[2]))); - t2 = reinterpret_cast(vec_mergeh(reinterpret_cast(kernel.packet[1]), reinterpret_cast(kernel.packet[3]))); - t3 = reinterpret_cast(vec_mergel(reinterpret_cast(kernel.packet[1]), reinterpret_cast(kernel.packet[3]))); - kernel.packet[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(t0), reinterpret_cast(t2))); - kernel.packet[1] = reinterpret_cast(vec_mergel(reinterpret_cast(t0), reinterpret_cast(t2))); - kernel.packet[2] = reinterpret_cast(vec_mergeh(reinterpret_cast(t1), reinterpret_cast(t3))); - kernel.packet[3] = reinterpret_cast(vec_mergel(reinterpret_cast(t1), reinterpret_cast(t3))); -} - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - Packet4i t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); kernel.packet[0] = vec_mergeh(t0, t2); kernel.packet[1] = vec_mergel(t0, t2); kernel.packet[2] = vec_mergeh(t1, t3); @@ -921,13 +584,24 @@ ptranspose(PacketBlock& kernel) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - Packet2d t0, t1; - t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); - t1 = vec_perm(kernel.packet[0], kernel.packet[1], 
p16uc_TRANSPOSE64_LO); + Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); + Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); kernel.packet[0] = t0; kernel.packet[1] = t1; } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + } // end namespace internal } // end namespace Eigen -- cgit v1.2.3 From bc0ad363c64b3c3d9d988de9b7405c390618db87 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 5 Apr 2016 06:01:17 -0400 Subject: add remaining includes --- Eigen/src/Core/arch/ZVector/Complex.h | 201 ++++++++++++++++++++++++++++ Eigen/src/Core/arch/ZVector/MathFunctions.h | 110 +++++++++++++++ 2 files changed, 311 insertions(+) create mode 100644 Eigen/src/Core/arch/ZVector/Complex.h create mode 100644 Eigen/src/Core/arch/ZVector/MathFunctions.h diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h new file mode 100644 index 000000000..9a8735ac1 --- /dev/null +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -0,0 +1,201 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_COMPLEX32_ALTIVEC_H +#define EIGEN_COMPLEX32_ALTIVEC_H + +namespace Eigen { + +namespace internal { + +static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; + +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ /* here we really have to use unaligned loads :( */ return ploadu(&from); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d a_re, a_im, v1, v2; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + // multiply a_re * b + v1 = vec_madd(a_re, b.v, p2d_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(a_im, b.v, p2d_ZERO); + v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8); + v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1); + + return Packet1cd(v1 + v2); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE 
Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) +{ + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); + + return res[0]; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) +{ + return pfirst(a); +} + +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) +{ + return vecs[0]; +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) +{ + return pfirst(a); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX32_ALTIVEC_H diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h new file mode 100644 index 000000000..6fff8524e --- /dev/null +++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp(const Packet2d& _x) +{ + Packet2d x = _x; + + _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); + _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); + _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + + _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + + Packet2d tmp, fx; + Packet2l emm0; + + // clamp x + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + /* express exp(x) as exp(g + n*log(2)) */ + fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); + + fx = vec_floor(fx); + + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet2d x2 = pmul(x,x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + + // build 2^n + emm0 = vec_ctsl(fx, 0); + + static const Packet2l p2l_1023 = { 1023, 1023 }; + static const Packet2ul p2ul_52 = { 52, 52 }; + + emm0 = emm0 + p2l_1023; + emm0 = emm0 << reinterpret_cast(p2ul_52); + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. + Packet2ul isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(x, reinterpret_cast(emm0)), _x), + isnumber_mask); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt(const Packet2d& x) +{ + return __builtin_s390_vfsqdb(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt(const Packet2d& x) { + // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. 
+ return pset1(1.0) / psqrt(x); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H -- cgit v1.2.3 From 4d7e230d2f8a55c45c1191fe08aa19d41e869a65 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 5 Apr 2016 14:49:41 +0200 Subject: bug #1189: fix pow/atan2 compilation for AutoDiffScalar --- unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 13 +++++++------ unsupported/test/autodiff.cpp | 2 +- unsupported/test/autodiff_scalar.cpp | 4 ++++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index e30ad5b6d..481dfa91a 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -589,23 +589,24 @@ EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log, return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));) template -inline const Eigen::AutoDiffScalar::Scalar>, const DerType> > -pow(const Eigen::AutoDiffScalar& x, typename Eigen::internal::traits::Scalar y) +inline const Eigen::AutoDiffScalar::type>::Scalar>, const typename internal::remove_all::type> > +pow(const Eigen::AutoDiffScalar& x, typename internal::traits::type>::Scalar y) { using namespace Eigen; - typedef typename Eigen::internal::traits::Scalar Scalar; - return AutoDiffScalar, const DerType> >( + typedef typename internal::remove_all::type DerTypeCleaned; + typedef typename Eigen::internal::traits::Scalar Scalar; + return AutoDiffScalar, const DerTypeCleaned> >( std::pow(x.value(),y), x.derivatives() * (y * std::pow(x.value(),y-1))); } template -inline const AutoDiffScalar::Scalar,Dynamic,1> > +inline const AutoDiffScalar::type>::Scalar,Dynamic,1> > atan2(const AutoDiffScalar& a, const AutoDiffScalar& b) { using std::atan2; - typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::type>::Scalar Scalar; typedef AutoDiffScalar > PlainADS; PlainADS ret; ret.value() = atan2(a.value(), b.value()); diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp index 1aa1b3d2d..374f86df9 100644 --- a/unsupported/test/autodiff.cpp +++ b/unsupported/test/autodiff.cpp @@ -16,7 +16,7 @@ EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y) using namespace std; // return x+std::sin(y); EIGEN_ASM_COMMENT("mybegin"); - return static_cast(x*2 - pow(x,2) + 2*sqrt(y*y) - 4 * sin(x) + 2 * cos(y) - exp(-0.5*x*x)); + return static_cast(x*2 - 1 + pow(1+x,2) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(-0.5*x*x+0)); //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2; EIGEN_ASM_COMMENT("myend"); } diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index ba4b5aec4..c631c734a 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -30,6 +30,10 @@ template void check_atan2() VERIFY_IS_APPROX(res.value(), x.value()); VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); + + res = atan2(r*s+0, r*c+0); + VERIFY_IS_APPROX(res.value(), x.value()); + VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); } -- cgit v1.2.3 From a350c25a396aa4fdef4878d165bb3dbaedf0a4bb Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Tue, 5 Apr 2016 18:20:40 +0100 Subject: Added accuracy comments. 
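The accuracy table added below records peak and rms relative error for single-precision zeta over the tested domain. For illustration, here is a minimal, hypothetical harness showing how such figures are typically produced; zeta_f (the single-precision routine under test) and zeta_ref (a trusted double-precision reference such as Cephes) are placeholder names, not functions from this patch:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <cstdlib>

    float  zeta_f(float x, float q);       // routine under test (assumed to exist)
    double zeta_ref(double x, double q);   // high-precision reference (assumed)

    int main() {
      const int trials = 10000;
      double peak = 0.0, sum_sq = 0.0;
      int counted = 0;
      for (int i = 0; i < trials; ++i) {
        // Sample the documented domain: x in [0,25], q positive.
        double x = 25.0 * std::rand() / double(RAND_MAX);
        double q = 1.0 + 9.0 * std::rand() / double(RAND_MAX);
        double ref = zeta_ref(x, q);
        if (ref == 0.0 || !std::isfinite(ref)) continue;  // skip poles and underflow
        double rel = std::fabs(double(zeta_f(float(x), float(q))) - ref) / std::fabs(ref);
        peak = std::max(peak, rel);
        sum_sq += rel * rel;
        ++counted;
      }
      std::printf("# trials %d  peak %.2g  rms %.2g\n", counted, peak,
                  counted ? std::sqrt(sum_sq / counted) : 0.0);
      return 0;
    }

The underflow caveat in the new comment matters for any such harness as well: for large arguments, powf() inside the implementation can underflow, so those samples must be filtered out or they inflate the reported error.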
--- Eigen/src/Core/SpecialFunctions.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 772449bc7..2a0a6ff15 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -846,7 +846,12 @@ struct zeta_impl { * * ACCURACY: * + * Relative error for single precision: + * arithmetic domain # trials peak rms + * IEEE 0,25 10000 6.9e-7 1.0e-7 * + * Large arguments may produce underflow in powf(), in which + * case the results are inaccurate. * * REFERENCE: * -- cgit v1.2.3 From 317384b397faee28ad9296778aab478be1fb6b85 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 5 Apr 2016 14:56:45 -0400 Subject: complete the port, remove float support --- Eigen/src/Core/arch/ZVector/PacketMath.h | 80 +++++++++----------------------- 1 file changed, 23 insertions(+), 57 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 3586f87af..5a7226be6 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -87,12 +87,6 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; -/* -static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} -static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} -*/ static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -168,16 +162,20 @@ template<> struct packet_traits : default_packet_traits HasSub = 1, HasMul = 1, HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, HasSin = 0, HasCos = 0, HasLog = 0, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1, HasRound = 1, HasFloor = 1, - HasCeil = 1 + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 }; }; @@ -261,6 +259,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { + // FIXME: No intrinsic yet EIGEN_DEBUG_ALIGNED_LOAD Packet *vfrom; vfrom = (Packet *) from; @@ -290,10 +289,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) return vec_splats(from); } -template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { - Packet2d res; - res = vec_splats(from); - return res; +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + return vec_splats(from); } template<> EIGEN_STRONG_INLINE void @@ -376,7 +373,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return 
vec_min(a, b); } @@ -385,75 +382,45 @@ template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } - template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - Packet *vfrom; - vfrom = (Packet *) from; - return vfrom->v4i; -} +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - Packet *vfrom; - vfrom = (Packet *) from; - return vfrom->v2d; -} template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { - Packet4i p; - if((ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); + Packet4i p = pload(from); return vec_perm(p, p, p16uc_DUPLICATE32_HI); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { - Packet2d p; - if((ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); + Packet2d p = pload(from); return vec_perm(p, p, p16uc_PSET64_HI); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - Packet *vto; - vto = (Packet *) to; - vto->v4i = from; -} - -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - Packet *vto; - vto = (Packet *) to; - vto->v2d = from; -} +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } 
+template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) @@ -487,7 +454,6 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) return pfirst(sum); } - template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { Packet4i v[4], sum[4]; @@ -545,7 +511,7 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { Packet4i b, res; - b = pmin(a, vec_sld(a, a, 8)); + b = pmin(a, vec_sld(a, a, 8)); res = pmin(b, vec_sld(b, b, 4)); return pfirst(res); } -- cgit v1.2.3 From 72abfa11ddc1e5169d109011fc1edeea41ed57f2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 09:07:30 -0700 Subject: Added support for isfinite on fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 212aa0d5d..7f484b636 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -414,6 +414,7 @@ using ::log; using ::sqrt; using ::floor; using ::ceil; +using ::isfinite; #if __cplusplus > 199711L template <> -- cgit v1.2.3 From 7781f865cb6cc3faff3b1dfce557439abe3b56b9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 09:35:23 -0700 Subject: Renamed the EIGEN_TEST_NVCC cmake option into EIGEN_TEST_CUDA per the discussion in bug #1173. 
--- cmake/EigenTesting.cmake | 2 +- test/CMakeLists.txt | 8 ++++---- unsupported/test/CMakeLists.txt | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 1709e0334..d5e3972b5 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -315,7 +315,7 @@ macro(ei_testing_print_summary) message(STATUS "C++11: OFF") endif() - if(EIGEN_TEST_NVCC) + if(EIGEN_TEST_CUDA) message(STATUS "CUDA: ON") else() message(STATUS "CUDA: OFF") diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4420e0c51..841c4572b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -325,9 +325,9 @@ if(EIGEN_TEST_EIGEN2) endif() -# NVCC unit tests -option(EIGEN_TEST_NVCC "Enable NVCC support in unit tests" OFF) -if(EIGEN_TEST_NVCC) +# CUDA unit tests +option(EIGEN_TEST_CUDA "Enable CUDA support in unit tests" OFF) +if(EIGEN_TEST_CUDA) find_package(CUDA 5.0) if(CUDA_FOUND) @@ -345,7 +345,7 @@ if(CUDA_FOUND) endif(CUDA_FOUND) -endif(EIGEN_TEST_NVCC) +endif(EIGEN_TEST_CUDA) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 6bd8cfb92..7972e6776 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -175,7 +175,7 @@ endif() # These tests needs nvcc find_package(CUDA 7.0) -if(CUDA_FOUND AND EIGEN_TEST_NVCC) +if(CUDA_FOUND AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags since they trigger thousands of compilation warnings # in the CUDA runtime -- cgit v1.2.3 From cf7e73addd6405a132f9848d1d2c7bf93d8afdc0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 09:59:51 -0700 Subject: Added some missing conversions to the Half class, and fixed the implementation of the < operator on cuda devices. --- Eigen/src/Core/arch/CUDA/Half.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 7f484b636..927be0493 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -175,11 +175,17 @@ __device__ bool operator != (const half& a, const half& b) { return __hne(a, b); } __device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +__device__ bool operator <= (const half& a, const half& b) { return __hle(a, b); } __device__ bool operator > (const half& a, const half& b) { return __hgt(a, b); } +__device__ bool operator >= (const half& a, const half& b) { + return __hge(a, b); +} #else // Emulate support for half floats @@ -228,9 +234,15 @@ static inline EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) static inline EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { return float(a) < float(b); } +static inline EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return float(a) <= float(b); +} static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { return float(a) > float(b); } +static inline EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return float(a) >= float(b); +} #endif // Emulate support for half floats -- cgit v1.2.3 From 58c1dbff194edb5b5b7bdae7c4b01e94dac50900 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 13:44:08 -0700 Subject: Made the fp16 code more portable. 
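Beyond swapping inline for EIGEN_STRONG_INLINE, the diff below keeps the emulation path's reliance on the IEEE-754 binary16 layout: the sign lives in bit 15, so operator- is a single XOR with 0x8000 and fabsh() a single AND with 0x7FFF, with no round-trip through float. A self-contained sketch of that bit trick follows; the Half16 struct is illustrative and stands in for Eigen's half with its raw x field, not the actual type:

    #include <cstdint>
    #include <cstdio>

    struct Half16 { std::uint16_t x; };  // raw binary16 bits (assumed layout)

    Half16 habs(Half16 a) { a.x &= 0x7FFF; return a; }          // clear the sign bit
    Half16 hneg(Half16 a) { a.x ^= 0x8000; return a; }          // flip the sign bit
    bool hisinf(Half16 a) { return (a.x & 0x7FFF) == 0x7C00; }  // max exponent, zero mantissa

    int main() {
      Half16 minus_two = { 0xC000 };  // -2.0 in binary16
      std::printf("abs bits: 0x%04X\n", unsigned(habs(minus_two).x));  // 0x4000, i.e. +2.0
      std::printf("neg bits: 0x%04X\n", unsigned(hneg(minus_two).x));  // 0x4000, i.e. +2.0
      Half16 inf = { 0x7C00 };
      std::printf("isinf: %d\n", int(hisinf(inf)));                    // 1
      return 0;
    }

The same mask appears in the patch's isinf test, (a.x & 0x7fff) == 0x7c00, which is why it behaves identically on host and device: it never touches floating-point hardware.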
--- Eigen/src/Core/arch/CUDA/Half.h | 114 +++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 47 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 927be0493..916812b61 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -55,9 +55,9 @@ namespace Eigen { namespace internal { -static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); -static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); -static inline EIGEN_DEVICE_FUNC float half_to_float(__half h); +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); } // end namespace internal @@ -192,55 +192,55 @@ __device__ bool operator >= (const half& a, const half& b) { // Definitions for CPUs and older CUDA, mostly working through conversion // to/from fp32. -static inline EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(float(a) + float(b)); } -static inline EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { return half(float(a) * float(b)); } -static inline EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { return half(float(a) - float(b)); } -static inline EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { return half(float(a) / float(b)); } -static inline EIGEN_DEVICE_FUNC half operator - (const half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { half result; result.x = a.x ^ 0x8000; return result; } -static inline EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { a = half(float(a) + float(b)); return a; } -static inline EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { a = half(float(a) * float(b)); return a; } -static inline EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { a = half(float(a) - float(b)); return a; } -static inline EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { a = half(float(a) / float(b)); return a; } -static inline EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { return float(a) == float(b); } -static inline EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { return float(a) != float(b); } -static inline EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { return 
float(a) < float(b); } -static inline EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { return float(a) <= float(b); } -static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { return float(a) > float(b); } -static inline EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { return float(a) >= float(b); } @@ -248,7 +248,7 @@ static inline EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) // Division by an index. Do it in full float precision to avoid accuracy // issues in converting the denominator to half. -static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { return Eigen::half(static_cast(a) / static_cast(b)); } @@ -259,7 +259,7 @@ static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { namespace internal { -static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { __half h; h.x = x; return h; @@ -270,7 +270,7 @@ union FP32 { float f; }; -static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(ff); #else @@ -318,7 +318,7 @@ static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #endif } -static inline EIGEN_DEVICE_FUNC float half_to_float(__half h) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __half2float(h); #else @@ -356,11 +356,11 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct NumTraits : GenericNumTraits { - EIGEN_DEVICE_FUNC static inline float dummy_precision() { return 1e-3f; } - EIGEN_DEVICE_FUNC static inline Eigen::half highest() { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float dummy_precision() { return 1e-3f; } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { return internal::raw_uint16_to_half(0x7bff); } - EIGEN_DEVICE_FUNC static inline Eigen::half lowest() { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { return internal::raw_uint16_to_half(0xfbff); } }; @@ -369,10 +369,10 @@ template<> struct NumTraits namespace numext { -static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { return (a.x & 0x7fff) == 0x7c00; } -static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hisnan(a); #else @@ -385,33 +385,33 @@ static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { } // end namespace Eigen // Standard mathematical functions and trancendentals. 
-static inline EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { Eigen::half result; result.x = a.x & 0x7FFF; return result; } -static inline EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } -static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { return Eigen::half(::logf(float(a))); } -static inline EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { return Eigen::half(::sqrtf(float(a))); } -static inline EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { return Eigen::half(::floorf(float(a))); } -static inline EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { return Eigen::half(::ceilf(float(a))); } -static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isnan)(const Eigen::half& a) { return (Eigen::numext::isnan)(a); } -static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isinf)(const Eigen::half& a) { return (Eigen::numext::isinf)(a); } -static inline EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a) { return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); } @@ -420,19 +420,39 @@ namespace std { // Import the standard mathematical functions and trancendentals into the // into the std namespace. 
-using ::abs; -using ::exp; -using ::log; -using ::sqrt; -using ::floor; -using ::ceil; -using ::isfinite; +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { + return ::fabsh(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) { + return ::exph(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { + return ::logh(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) { + return ::sqrth(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { + return ::floorh(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { + return ::ceilh(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isnan)(const Eigen::half& a) { + return (Eigen::numext::isnan)(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isinf)(const Eigen::half& a) { + return (Eigen::numext::isinf)(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { + return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); +} #if __cplusplus > 199711L template <> struct hash { - size_t operator()(const Eigen::half& a) const { - return std::hash()(a.x); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { + return static_cast(a.x); } }; #endif @@ -442,14 +462,14 @@ struct hash { // Add the missing shfl_xor intrinsic #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 -__device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { return static_cast(__shfl_xor(static_cast(var), laneMask, width)); } #endif // ldg() has an overload for __half, but we also need one for Eigen::half. #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320 -static inline EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::internal::raw_uint16_to_half( __ldg(reinterpret_cast(ptr))); } -- cgit v1.2.3 From 7be1eaad1e6e046922ae0673cfa115d221380040 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 14:15:37 -0700 Subject: Fixed typos in the implementation of the zeta and polygamma ops. 
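Before this fix, the tensor methods for zeta and polygamma had been copy-pasted under the name igammac() and dispatched to scalar_igammac_op, so only igamma/igammac were actually reachable. With the corrected bindings, the calling convention is that the tensor on the left supplies the first argument (x for zeta, n for polygamma). A minimal host-side sketch, assuming the usual unsupported Tensor include path and C++11; the values are illustrative, not from the patch:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<double, 1> x(3), q(3), n(3);
  x.setValues({2.0, 3.0, 4.0});  // exponents for zeta(x, q)
  q.setValues({1.0, 1.0, 1.0});  // offsets; zeta(x, 1) is the Riemann zeta
  n.setValues({1.0, 2.0, 3.0});  // derivative orders for polygamma(n, x)

  Eigen::Tensor<double, 1> z = x.zeta(q);       // coefficient-wise Hurwitz zeta
  Eigen::Tensor<double, 1> p = n.polygamma(x);  // coefficient-wise polygamma
  std::cout << z << "\n" << p << "\n";          // e.g. z(0) == pi^2 / 6
  return 0;
}
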
--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 56 ++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 77b509f61..0729455fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -133,6 +133,34 @@ class TensorBase return unaryExpr(internal::scalar_digamma_op()); } + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + + // zeta(x = this, q = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + zeta(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_zeta_op()); + } + + // polygamma(n = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + polygamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_polygamma_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> erf() const { @@ -340,34 +368,6 @@ class TensorBase return binaryExpr(other.derived(), internal::scalar_cmp_op()); } - // igamma(a = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igamma(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igamma_op()); - } - - // igammac(a = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igammac(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igammac_op()); - } - - // zeta(x = this, q = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igammac(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igammac_op()); - } - - // polygamma(n = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igammac(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igammac_op()); - } - // comparisons and tests for Scalars EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > -- cgit v1.2.3 From 165150e89677bf1006ee8d3a66891744f228206d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 14:31:01 -0700 Subject: Fixed the tests for the zeta and polygamma functions --- unsupported/test/cxx11_tensor_cuda.cu | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index fc56ae71d..33796690d 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -658,7 +658,8 @@ void test_cuda_zeta() 
std::size_t bytes = in_x.size() * sizeof(Scalar); - Scalar* d_in_x, d_in_q; + Scalar* d_in_x; + Scalar* d_in_q; Scalar* d_out; cudaMalloc((void**)(&d_in_x), bytes); cudaMalloc((void**)(&d_in_q), bytes); @@ -721,7 +722,8 @@ void test_cuda_polygamma() std::size_t bytes = in_x.size() * sizeof(Scalar); - Scalar* d_in_x, d_in_n; + Scalar* d_in_x; + Scalar* d_in_n; Scalar* d_out; cudaMalloc((void**)(&d_in_x), bytes); cudaMalloc((void**)(&d_in_n), bytes); @@ -983,8 +985,10 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_4(test_cuda_lgamma(0.01f)); CALL_SUBTEST_4(test_cuda_lgamma(0.001f)); - CALL_SUBTEST_4(test_cuda_digamma()); - + CALL_SUBTEST_4(test_cuda_lgamma(1.0)); + CALL_SUBTEST_4(test_cuda_lgamma(100.0)); + CALL_SUBTEST_4(test_cuda_lgamma(0.01)); + CALL_SUBTEST_4(test_cuda_lgamma(0.001)); CALL_SUBTEST_4(test_cuda_erf(1.0f)); CALL_SUBTEST_4(test_cuda_erf(100.0f)); @@ -997,13 +1001,6 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_4(test_cuda_erfc(0.01f)); CALL_SUBTEST_4(test_cuda_erfc(0.001f)); - CALL_SUBTEST_4(test_cuda_lgamma(1.0)); - CALL_SUBTEST_4(test_cuda_lgamma(100.0)); - CALL_SUBTEST_4(test_cuda_lgamma(0.01)); - CALL_SUBTEST_4(test_cuda_lgamma(0.001)); - - CALL_SUBTEST_4(test_cuda_digamma()); - CALL_SUBTEST_4(test_cuda_erf(1.0)); CALL_SUBTEST_4(test_cuda_erf(100.0)); CALL_SUBTEST_4(test_cuda_erf(0.01)); @@ -1015,6 +1012,15 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_4(test_cuda_erfc(0.01)); CALL_SUBTEST_4(test_cuda_erfc(0.001)); + CALL_SUBTEST_5(test_cuda_digamma()); + CALL_SUBTEST_5(test_cuda_digamma()); + + CALL_SUBTEST_5(test_cuda_polygamma()); + CALL_SUBTEST_5(test_cuda_polygamma()); + + CALL_SUBTEST_5(test_cuda_zeta()); + CALL_SUBTEST_5(test_cuda_zeta()); + CALL_SUBTEST_5(test_cuda_igamma()); CALL_SUBTEST_5(test_cuda_igammac()); -- cgit v1.2.3 From 532fdf24cb8e0ec0ee546a8ba57fc3d75f138e9f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 17:11:31 -0700 Subject: Added support for hardware conversion between fp16 and full floats whenever possible. 
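What the new EIGEN_HAS_FP16_C code path boils down to, sketched with the raw x86 intrinsics the patch relies on (compile with -mf16c or anything else that defines __F16C__; the bit-twiddling fallback path is unchanged):

#include <immintrin.h>
#include <cstdio>

int main() {
  float f = 3.14159f;
  unsigned short bits = _cvtss_sh(f, 0);  // float -> fp16 bits, round-to-nearest-even
  float back = _cvtsh_ss(bits);           // fp16 bits -> float, always exact
  std::printf("0x%04x -> %f\n", bits, back);
  return 0;
}

On hardware with F16C each conversion is a single instruction, which is what makes the extra #elif branch worthwhile.
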
--- Eigen/Core | 5 +++++ Eigen/src/Core/arch/CUDA/Half.h | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index e44819383..1e62f3ec1 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -204,6 +204,11 @@ #endif #endif +#if defined(__F16C__) + // We can use the optimized fp16 to float and float to fp16 conversion routines + #define EIGEN_HAS_FP16_C +#endif + #if defined __CUDACC__ #define EIGEN_VECTORIZE_CUDA #include diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 916812b61..0638dab5c 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -273,6 +273,12 @@ union FP32 { static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(ff); + +#elif defined(EIGEN_HAS_FP16_C) + __half h; + h.x = _cvtss_sh(ff, 0); + return h; + #else FP32 f; f.f = ff; @@ -321,6 +327,10 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __half2float(h); + +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); + #else const FP32 magic = { 113 << 23 }; const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift -- cgit v1.2.3 From 14ea7c7ec7652886d474c8a51697d39c571367a1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 19:30:21 -0700 Subject: Fixed packet_traits --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 9e1d87062..14f0c9415 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -39,10 +39,6 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasErf = 1, - HasErfc = 1, HasBlend = 0, }; -- cgit v1.2.3 From df838736e2b59164a23236c07635187d2b9f60c2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Apr 2016 20:48:55 -0700 Subject: Fixed compilation warning triggered by msvc --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 0638dab5c..a2f46a898 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -83,7 +83,7 @@ struct half : public __half { EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { // +0.0 and -0.0 become false, everything else becomes true. - return static_cast(x & 0x7fff); + return (x & 0x7fff) != 0; } EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { return static_cast(internal::half_to_float(*this)); -- cgit v1.2.3 From cfb34d808bd70efc046c55305bbe472e8e3c1e62 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 08:46:52 -0700 Subject: Fixed a possible integer overflow. 
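The loop bounds in the code below (numAverageKnots, numParameters) have type DenseIndex, typically a 64-bit std::ptrdiff_t, so counting with a 32-bit int can overflow once the counts no longer fit. A hedged toy illustration of the failure mode; the variable names and sizes are ours, not from the patch:

#include <cstdint>
#include <iostream>

int main() {
  std::int64_t count = std::int64_t(1) << 32;  // hypothetical: 2^32 parameters
  int narrowed = static_cast<int>(count);      // truncates to 0
  std::cout << narrowed << "\n";
  // A loop "for (int j = 0; j < count; ++j)" is worse still: incrementing j
  // past INT_MAX is undefined behavior. Keeping the counter in DenseIndex
  // puts the index and the bound in the same (wide) type.
  return 0;
}
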
--- unsupported/Eigen/src/Splines/SplineFitting.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h index 8e6a5aaed..c761a9b3d 100644 --- a/unsupported/Eigen/src/Splines/SplineFitting.h +++ b/unsupported/Eigen/src/Splines/SplineFitting.h @@ -130,12 +130,12 @@ namespace Eigen ParameterVectorType temporaryParameters(numParameters + 1); KnotVectorType derivativeKnots(numInternalDerivatives); - for (unsigned int i = 0; i < numAverageKnots - 1; ++i) + for (DenseIndex i = 0; i < numAverageKnots - 1; ++i) { temporaryParameters[0] = averageKnots[i]; ParameterVectorType parameterIndices(numParameters); int temporaryParameterIndex = 1; - for (int j = 0; j < numParameters; ++j) + for (DenseIndex j = 0; j < numParameters; ++j) { Scalar parameter = parameters[j]; if (parameter >= averageKnots[i] && parameter < averageKnots[i + 1]) -- cgit v1.2.3 From 48308ed801fa8ee68116a8deb2c3d1567630d71f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 09:48:36 -0700 Subject: Added support for isinf, isnan, and isfinite checks to the tensor api --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 17 +++++++++++ unsupported/test/cxx11_tensor_cuda.cu | 38 +++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 0729455fb..69d1802d5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -400,6 +400,23 @@ class TensorBase return operator!=(constant(threshold)); } + // Checks + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isnan)() const { + return unaryExpr(internal::scalar_isnan_op()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isinf)() const { + return unaryExpr(internal::scalar_isinf_op()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isfinite)() const { + return unaryExpr(internal::scalar_isfinite_op()); + } + // Coefficient-wise ternary operators. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorSelectOp diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 33796690d..5f548ff0c 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -122,6 +122,43 @@ void test_cuda_elementwise() cudaFree(d_out); } +void test_cuda_props() { + Tensor in1(200); + Tensor out(200); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(bool); + + float* d_in1; + bool* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap, Eigen::Aligned> gpu_in1( + d_in1, 200); + Eigen::TensorMap, Eigen::Aligned> gpu_out( + d_out, 200); + + gpu_out.device(gpu_device) = (gpu_in1.isnan)(); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 200; ++i) { + VERIFY_IS_EQUAL(out(i), (std::isinf)(in1(i))); + } + + cudaFree(d_in1); + cudaFree(d_out); +} + void test_cuda_reduction() { Tensor in1(72,53,97,113); @@ -964,6 +1001,7 @@ void test_cxx11_tensor_cuda() { CALL_SUBTEST_1(test_cuda_elementwise_small()); CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_1(test_cuda_props()); CALL_SUBTEST_1(test_cuda_reduction()); CALL_SUBTEST_2(test_cuda_contraction()); CALL_SUBTEST_2(test_cuda_contraction()); -- cgit v1.2.3 From b89d3f78b21d267ec6f1118b7a8445b9f71d4613 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 10:08:49 -0700 Subject: Updated the isnan, isinf and isfinite functions to make compatible with cuda devices. 
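The practical effect of the change below, sketched: Eigen::numext::isnan/isinf/isfinite can now appear in functions compiled for both host and device; under __CUDA_ARCH__ they forward to the CUDA builtins, elsewhere to std:: or the classification fallback. The parentheses around each call guard against platforms that define isnan and friends as macros. Assuming only Eigen/Core:

#include <Eigen/Core>

template <typename T>
EIGEN_DEVICE_FUNC bool all_finite3(const T& a, const T& b, const T& c) {
  return (Eigen::numext::isfinite)(a) &&
         (Eigen::numext::isfinite)(b) &&
         (Eigen::numext::isfinite)(c);
}

int main() {
  return all_finite3(1.0f, 2.0f, 3.0f) ? 0 : 1;  // host build: plain inline function
}
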
--- Eigen/src/Core/MathFunctions.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index e6c7dfa08..fd73f543b 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -23,7 +23,7 @@ double abs(double x) { return (fabs(x)); } float abs(float x) { return (fabsf(x)); } long double abs(long double x) { return (fabsl(x)); } #endif - + namespace internal { /** \internal \class global_math_functions_filtering_base @@ -704,7 +704,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #if EIGEN_USE_STD_FPCLASSIFY + #ifdef __CUDA_ARCH__ + return (isfinite)(x); + #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; return isfinite EIGEN_NOT_A_MACRO (x); #else @@ -717,7 +719,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #if EIGEN_USE_STD_FPCLASSIFY + #ifdef __CUDA_ARCH__ + return (isinf)(x); + #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; return isinf EIGEN_NOT_A_MACRO (x); #else @@ -730,7 +734,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #if EIGEN_USE_STD_FPCLASSIFY + #ifdef __CUDA_ARCH__ + return (isnan)(x); + #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; return isnan EIGEN_NOT_A_MACRO (x); #else @@ -780,9 +786,9 @@ template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return #endif // The following overload are defined at the end of this file -template bool isfinite_impl(const std::complex& x); -template bool isnan_impl(const std::complex& x); -template bool isinf_impl(const std::complex& x); +template EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex& x); +template EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x); +template EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x); } // end namespace internal @@ -1089,19 +1095,19 @@ double fmod(const double& a, const double& b) { namespace internal { template -bool isfinite_impl(const std::complex& x) +EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex& x) { return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x)); } template -bool isnan_impl(const std::complex& x) +EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x) { return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x)); } template -bool isinf_impl(const std::complex& x) +EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x) { return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x)); } -- cgit v1.2.3 From 8db269e055f214481e96de20c41f6b8659cddc5b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 10:41:51 -0700 Subject: Fixed a typo in a test --- unsupported/test/cxx11_tensor_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 5f548ff0c..b1bee6530 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -152,7 +152,7 @@ void test_cuda_props() { assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); for (int i = 0; i < 200; ++i) { - VERIFY_IS_EQUAL(out(i), (std::isinf)(in1(i))); + VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); } cudaFree(d_in1); -- cgit 
v1.2.3 From dc45aaeb93dfef6ee75671cbc57c600b60ea22b6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 11:18:05 -0700 Subject: Added tests for float16 --- unsupported/test/CMakeLists.txt | 2 + unsupported/test/float16.cpp | 147 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 unsupported/test/float16.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 7972e6776..96652bfcf 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,6 +110,8 @@ ei_add_test(minres) ei_add_test(levenberg_marquardt) ei_add_test(kronecker_product) +ei_add_test(float16) + if(EIGEN_TEST_CXX11) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/float16.cpp b/unsupported/test/float16.cpp new file mode 100644 index 000000000..13f3ddaca --- /dev/null +++ b/unsupported/test/float16.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC float16 + +#include "main.h" +#include + +using Eigen::half; + +void test_conversion() +{ + // Conversion from float. + VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5f).x, 0x3800); + VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555); + VERIFY_IS_EQUAL(half(0.0f).x, 0x0000); + VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000); + VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff); + VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity. + + // Denormals. + VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001); + VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001); + VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); + + // Verify round-to-nearest-even behavior. + float val1 = float(half(__half{0x3c00})); + float val2 = float(half(__half{0x3c01})); + float val3 = float(half(__half{0x3c02})); + VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02); + + // Conversion from int. + VERIFY_IS_EQUAL(half(-1).x, 0xbc00); + VERIFY_IS_EQUAL(half(0).x, 0x0000); + VERIFY_IS_EQUAL(half(1).x, 0x3c00); + VERIFY_IS_EQUAL(half(2).x, 0x4000); + VERIFY_IS_EQUAL(half(3).x, 0x4200); + + // Conversion from bool. + VERIFY_IS_EQUAL(half(false).x, 0x0000); + VERIFY_IS_EQUAL(half(true).x, 0x3c00); + + // Conversion to float. + VERIFY_IS_EQUAL(float(half(__half{0x0000})), 0.0f); + VERIFY_IS_EQUAL(float(half(__half{0x3c00})), 1.0f); + + // Denormals. + VERIFY_IS_APPROX(float(half(__half{0x8001})), -5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half{0x0001})), 5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half{0x0002})), 1.19209e-07f); + + // NaNs and infinities. + VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. 
+ VERIFY(!(numext::isnan)(float(half(0.0f)))); + VERIFY((numext::isinf)(float(half(__half{0xfc00})))); + VERIFY((numext::isnan)(float(half(__half{0xfc01})))); + VERIFY((numext::isinf)(float(half(__half{0x7c00})))); + VERIFY((numext::isnan)(float(half(__half{0x7c01})))); + VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); + VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); + VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); + + // Exactly same checks as above, just directly on the half representation. + VERIFY(!(numext::isinf)(half(__half{0x7bff}))); + VERIFY(!(numext::isnan)(half(__half{0x0000}))); + VERIFY((numext::isinf)(half(__half{0xfc00}))); + VERIFY((numext::isnan)(half(__half{0xfc01}))); + VERIFY((numext::isinf)(half(__half{0x7c00}))); + VERIFY((numext::isnan)(half(__half{0x7c01}))); + VERIFY((numext::isnan)(half(0.0 / 0.0))); + VERIFY((numext::isinf)(half(1.0 / 0.0))); + VERIFY((numext::isinf)(half(-1.0 / 0.0))); +} + +void test_arithmetic() +{ + VERIFY_IS_EQUAL(float(half(2) + half(2)), 4); + VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0); + VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f); + VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f); + VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f); + VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f); + VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f); +} + +void test_comparison() +{ + VERIFY(half(1.0f) > half(0.5f)); + VERIFY(half(0.5f) < half(1.0f)); + VERIFY(!(half(1.0f) < half(0.5f))); + VERIFY(!(half(0.5f) > half(1.0f))); + + VERIFY(!(half(4.0f) > half(4.0f))); + VERIFY(!(half(4.0f) < half(4.0f))); + + VERIFY(!(half(0.0f) < half(-0.0f))); + VERIFY(!(half(-0.0f) < half(0.0f))); + VERIFY(!(half(0.0f) > half(-0.0f))); + VERIFY(!(half(-0.0f) > half(0.0f))); + + VERIFY(half(0.2f) > half(-1.0f)); + VERIFY(half(-1.0f) < half(0.2f)); + VERIFY(half(-16.0f) < half(-15.0f)); + + VERIFY(half(1.0f) == half(1.0f)); + VERIFY(half(1.0f) != half(2.0f)); + + // Comparisons with NaNs and infinities. 
+ VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); + VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); + + VERIFY(!(half(1.0) == half(0.0 / 0.0))); + VERIFY(!(half(1.0) < half(0.0 / 0.0))); + VERIFY(!(half(1.0) > half(0.0 / 0.0))); + VERIFY(half(1.0) != half(0.0 / 0.0)); + + VERIFY(half(1.0) < half(1.0 / 0.0)); + VERIFY(half(1.0) > half(-1.0 / 0.0)); +} + +void test_functions() +{ + VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); + VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); + + VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); + VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); + + VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f); + VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); +} + +void test_float16() +{ + CALL_SUBTEST(test_conversion()); + CALL_SUBTEST(test_arithmetic()); + CALL_SUBTEST(test_comparison()); + CALL_SUBTEST(test_functions()); +} -- cgit v1.2.3 From 737644366fdec29d4f6720fb05c2e2943237cfbc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 11:40:15 -0700 Subject: Move the functions operating on fp16 out of the std namespace and into the Eigen::numext namespace --- Eigen/src/Core/arch/CUDA/Half.h | 53 ++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index a2f46a898..0a3b301bf 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -389,6 +389,29 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) return (a.x & 0x7fff) > 0x7c00; #endif } +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { + return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { + Eigen::half result; + result.x = a.x & 0x7FFF; + return result; +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) { + return Eigen::half(::expf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { + return Eigen::half(::logf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) { + return Eigen::half(::sqrtf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { + return Eigen::half(::floorf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { + return Eigen::half(::ceilf(float(a))); +} } // end namespace numext @@ -428,36 +451,6 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a namespace std { -// Import the standard mathematical functions and trancendentals into the -// into the std namespace. 
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { - return ::fabsh(a); -} -static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) { - return ::exph(a); -} -static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { - return ::logh(a); -} -static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) { - return ::sqrth(a); -} -static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { - return ::floorh(a); -} -static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { - return ::ceilh(a); -} -static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isnan)(const Eigen::half& a) { - return (Eigen::numext::isnan)(a); -} -static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isinf)(const Eigen::half& a) { - return (Eigen::numext::isinf)(a); -} -static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { - return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); -} - #if __cplusplus > 199711L template <> struct hash { -- cgit v1.2.3 From 74f64838c5cb453a6a7d9d8e858ee2511b4db731 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 11:42:14 -0700 Subject: Updated the unary functors to use the numext implementation of typicall functions instead of the one provided in the standard library. The standard library functions aren't supported officially by cuda, so we're better off using the numext implementations. --- Eigen/src/Core/functors/UnaryFunctors.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 26c02e4a7..7ba0abedc 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -73,7 +73,7 @@ template struct abs_knowing_score EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) typedef typename NumTraits::Real result_type; template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { return numext::abs(a); } }; template struct abs_knowing_score::Score_is_abs> { @@ -230,7 +230,7 @@ struct functor_traits > */ template struct scalar_exp_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::exp(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } }; @@ -246,7 +246,7 @@ struct functor_traits > */ template struct scalar_log_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); } }; @@ -276,7 +276,7 @@ struct functor_traits > */ template struct scalar_sqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sqrt(a); } template 
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } }; @@ -294,7 +294,7 @@ struct functor_traits > */ template struct scalar_rsqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return Scalar(1)/sqrt(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } }; @@ -814,9 +814,8 @@ struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { - using std::abs; typedef typename NumTraits::Real real_type; - real_type aa = abs(a); + real_type aa = numext::abs(a); if (aa==0) return Scalar(0); aa = 1./aa; -- cgit v1.2.3 From c912b1d28cd7017392234972ffa34d1707d59188 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 11:51:07 -0700 Subject: Fixed a typo in the polygamma test. --- unsupported/test/cxx11_tensor_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index b1bee6530..233acb528 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -776,7 +776,7 @@ void test_cuda_polygamma() Eigen::TensorMap > gpu_in_n(d_in_n, 7); Eigen::TensorMap > gpu_out(d_out, 7); - gpu_out.device(gpu_device) = gpu_in_n.zeta(gpu_in_x); + gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); -- cgit v1.2.3 From a02ec09511014b4c5fa0be97bfe3a6d3591f730f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 12:11:02 -0700 Subject: Worked around numerical noise in the test for the zeta function. 
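The workaround below amounts to: for the one input where the zeta implementation is numerically touchy, accept a result that is approximately equal or smaller instead of demanding VERIFY_IS_APPROX for all six elements. A hedged distillation of that acceptance test; the helper name and tolerance are ours, the Eigen test itself uses VERIFY_IS_APPROX_OR_LESS_THAN:

#include <cmath>
#include <cassert>

bool approx_or_less(double x, double ref, double rel = 1e-12) {
  return x <= ref || std::fabs(x - ref) <= rel * std::fabs(ref);
}

int main() {
  // zeta(2, 1) = pi^2 / 6; a last-bit wobble should not fail the suite.
  assert(approx_or_less(1.6449340668482264, 1.6449340668482264));
  return 0;
}
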
--- unsupported/test/cxx11_tensor_cuda.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 233acb528..134359611 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -718,9 +718,12 @@ void test_cuda_zeta() assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); VERIFY_IS_EQUAL(out(0), expected_out(0)); + VERIFY_IS_APPROX_OR_LESS_THAN(out(3), expected_out(3)); for (int i = 1; i < 6; ++i) { - VERIFY_IS_APPROX(out(i), expected_out(i)); + if (i != 3) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } } } -- cgit v1.2.3 From 2d5bb375b7d2d2e1839419264b2779b420cdbd79 Mon Sep 17 00:00:00 2001 From: parthaEth Date: Fri, 8 Apr 2016 00:14:44 +0200 Subject: Static casting scalar types so as to let chlesky module of eigen work with ceres --- Eigen/src/Cholesky/LDLT.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index c3cc3746c..bd4902987 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -266,8 +266,8 @@ template<> struct ldlt_inplace if (size <= 1) { transpositions.setIdentity(); - if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef; - else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef; + if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; + else if (numext::real(mat.coeff(0,0)) < static_cast(0)) sign = NegativeSemiDef; else sign = ZeroSign; return true; } @@ -324,12 +324,12 @@ template<> struct ldlt_inplace A21 /= realAkk; if (sign == PositiveSemiDef) { - if (realAkk < 0) sign = Indefinite; + if (realAkk < static_cast(0)) sign = Indefinite; } else if (sign == NegativeSemiDef) { - if (realAkk > 0) sign = Indefinite; + if (realAkk > static_cast(0)) sign = Indefinite; } else if (sign == ZeroSign) { - if (realAkk > 0) sign = PositiveSemiDef; - else if (realAkk < 0) sign = NegativeSemiDef; + if (realAkk > static_cast(0)) sign = PositiveSemiDef; + else if (realAkk < static_cast(0)) sign = NegativeSemiDef; } } -- cgit v1.2.3 From a6d08be9b2e1b9e11a488419b7dd0affcc321a32 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 17:13:44 -0700 Subject: Fixed the benchmarking of fp16 coefficient wise operations --- bench/tensors/tensor_benchmarks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index a4f97728d..16b388abf 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -248,7 +248,7 @@ template class BenchmarkSuite { StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7); + C.device(device_) = A * A.constant(static_cast(3.14)) + B * B.constant(static_cast(2.7)); } // Record the number of FLOP executed per second (2 multiplications and // 1 addition per value) -- cgit v1.2.3 From 7d5b17087f6a54fab94decaaa9046ff21fa4683a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 20:01:19 -0700 Subject: Added missing EIGEN_DEVICE_FUNC to the tensor conversion code. 
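Context for the one-line change below, sketched: a tensor cast expression instantiates PacketConverter, so evaluating one on a GpuDevice requires its constructor to carry EIGEN_DEVICE_FUNC like the rest of the evaluator. On the host the same expression already compiled; assuming the unsupported Tensor header:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> f(4, 4);
  f.setRandom();
  Eigen::Tensor<double, 2> d = f.cast<double>();  // the conversion expression
  // With a GPU device the analogous line would be:
  //   d_gpu.device(gpu_device) = f_gpu.cast<double>();
  return d.size() == 16 ? 0 : 1;
}
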
--- unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index f2dee3ee8..a96776a77 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -87,6 +87,7 @@ struct PacketConverter { template struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} -- cgit v1.2.3 From d962fe6a9980065456718b1ab41750dcb76f8971 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 20:28:32 -0700 Subject: Renamed float16 into cxx11_float16 since the test relies on c++11 features --- unsupported/test/CMakeLists.txt | 3 +- unsupported/test/cxx11_float16.cpp | 147 +++++++++++++++++++++++++++++++++++++ unsupported/test/float16.cpp | 147 ------------------------------------- 3 files changed, 148 insertions(+), 149 deletions(-) create mode 100644 unsupported/test/cxx11_float16.cpp delete mode 100644 unsupported/test/float16.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 96652bfcf..c6a92fe73 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,13 +110,12 @@ ei_add_test(minres) ei_add_test(levenberg_marquardt) ei_add_test(kronecker_product) -ei_add_test(float16) - if(EIGEN_TEST_CXX11) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. set(CMAKE_CXX_STANDARD 11) + ei_add_test(cxx11_float16) ei_add_test(cxx11_meta) ei_add_test(cxx11_tensor_simple) # ei_add_test(cxx11_tensor_symmetry) diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp new file mode 100644 index 000000000..44ffa67b3 --- /dev/null +++ b/unsupported/test/cxx11_float16.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_float16 + +#include "main.h" +#include + +using Eigen::half; + +void test_conversion() +{ + // Conversion from float. + VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5f).x, 0x3800); + VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555); + VERIFY_IS_EQUAL(half(0.0f).x, 0x0000); + VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000); + VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff); + VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity. + + // Denormals. + VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001); + VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001); + VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); + + // Verify round-to-nearest-even behavior. + float val1 = float(half(__half{0x3c00})); + float val2 = float(half(__half{0x3c01})); + float val3 = float(half(__half{0x3c02})); + VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02); + + // Conversion from int. + VERIFY_IS_EQUAL(half(-1).x, 0xbc00); + VERIFY_IS_EQUAL(half(0).x, 0x0000); + VERIFY_IS_EQUAL(half(1).x, 0x3c00); + VERIFY_IS_EQUAL(half(2).x, 0x4000); + VERIFY_IS_EQUAL(half(3).x, 0x4200); + + // Conversion from bool. 
+ VERIFY_IS_EQUAL(half(false).x, 0x0000); + VERIFY_IS_EQUAL(half(true).x, 0x3c00); + + // Conversion to float. + VERIFY_IS_EQUAL(float(half(__half{0x0000})), 0.0f); + VERIFY_IS_EQUAL(float(half(__half{0x3c00})), 1.0f); + + // Denormals. + VERIFY_IS_APPROX(float(half(__half{0x8001})), -5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half{0x0001})), 5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half{0x0002})), 1.19209e-07f); + + // NaNs and infinities. + VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. + VERIFY(!(numext::isnan)(float(half(0.0f)))); + VERIFY((numext::isinf)(float(half(__half{0xfc00})))); + VERIFY((numext::isnan)(float(half(__half{0xfc01})))); + VERIFY((numext::isinf)(float(half(__half{0x7c00})))); + VERIFY((numext::isnan)(float(half(__half{0x7c01})))); + VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); + VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); + VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); + + // Exactly same checks as above, just directly on the half representation. + VERIFY(!(numext::isinf)(half(__half{0x7bff}))); + VERIFY(!(numext::isnan)(half(__half{0x0000}))); + VERIFY((numext::isinf)(half(__half{0xfc00}))); + VERIFY((numext::isnan)(half(__half{0xfc01}))); + VERIFY((numext::isinf)(half(__half{0x7c00}))); + VERIFY((numext::isnan)(half(__half{0x7c01}))); + VERIFY((numext::isnan)(half(0.0 / 0.0))); + VERIFY((numext::isinf)(half(1.0 / 0.0))); + VERIFY((numext::isinf)(half(-1.0 / 0.0))); +} + +void test_arithmetic() +{ + VERIFY_IS_EQUAL(float(half(2) + half(2)), 4); + VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0); + VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f); + VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f); + VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f); + VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f); + VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f); +} + +void test_comparison() +{ + VERIFY(half(1.0f) > half(0.5f)); + VERIFY(half(0.5f) < half(1.0f)); + VERIFY(!(half(1.0f) < half(0.5f))); + VERIFY(!(half(0.5f) > half(1.0f))); + + VERIFY(!(half(4.0f) > half(4.0f))); + VERIFY(!(half(4.0f) < half(4.0f))); + + VERIFY(!(half(0.0f) < half(-0.0f))); + VERIFY(!(half(-0.0f) < half(0.0f))); + VERIFY(!(half(0.0f) > half(-0.0f))); + VERIFY(!(half(-0.0f) > half(0.0f))); + + VERIFY(half(0.2f) > half(-1.0f)); + VERIFY(half(-1.0f) < half(0.2f)); + VERIFY(half(-16.0f) < half(-15.0f)); + + VERIFY(half(1.0f) == half(1.0f)); + VERIFY(half(1.0f) != half(2.0f)); + + // Comparisons with NaNs and infinities. 
+ VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); + VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); + + VERIFY(!(half(1.0) == half(0.0 / 0.0))); + VERIFY(!(half(1.0) < half(0.0 / 0.0))); + VERIFY(!(half(1.0) > half(0.0 / 0.0))); + VERIFY(half(1.0) != half(0.0 / 0.0)); + + VERIFY(half(1.0) < half(1.0 / 0.0)); + VERIFY(half(1.0) > half(-1.0 / 0.0)); +} + +void test_functions() +{ + VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); + VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); + + VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); + VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); + + VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f); + VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); +} + +void test_cxx11_float16() +{ + CALL_SUBTEST(test_conversion()); + CALL_SUBTEST(test_arithmetic()); + CALL_SUBTEST(test_comparison()); + CALL_SUBTEST(test_functions()); +} diff --git a/unsupported/test/float16.cpp b/unsupported/test/float16.cpp deleted file mode 100644 index 13f3ddaca..000000000 --- a/unsupported/test/float16.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC float16 - -#include "main.h" -#include - -using Eigen::half; - -void test_conversion() -{ - // Conversion from float. - VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5f).x, 0x3800); - VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555); - VERIFY_IS_EQUAL(half(0.0f).x, 0x0000); - VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000); - VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff); - VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity. - - // Denormals. - VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001); - VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001); - VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); - - // Verify round-to-nearest-even behavior. - float val1 = float(half(__half{0x3c00})); - float val2 = float(half(__half{0x3c01})); - float val3 = float(half(__half{0x3c02})); - VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02); - - // Conversion from int. - VERIFY_IS_EQUAL(half(-1).x, 0xbc00); - VERIFY_IS_EQUAL(half(0).x, 0x0000); - VERIFY_IS_EQUAL(half(1).x, 0x3c00); - VERIFY_IS_EQUAL(half(2).x, 0x4000); - VERIFY_IS_EQUAL(half(3).x, 0x4200); - - // Conversion from bool. - VERIFY_IS_EQUAL(half(false).x, 0x0000); - VERIFY_IS_EQUAL(half(true).x, 0x3c00); - - // Conversion to float. - VERIFY_IS_EQUAL(float(half(__half{0x0000})), 0.0f); - VERIFY_IS_EQUAL(float(half(__half{0x3c00})), 1.0f); - - // Denormals. - VERIFY_IS_APPROX(float(half(__half{0x8001})), -5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half{0x0001})), 5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half{0x0002})), 1.19209e-07f); - - // NaNs and infinities. - VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. 
- VERIFY(!(numext::isnan)(float(half(0.0f)))); - VERIFY((numext::isinf)(float(half(__half{0xfc00})))); - VERIFY((numext::isnan)(float(half(__half{0xfc01})))); - VERIFY((numext::isinf)(float(half(__half{0x7c00})))); - VERIFY((numext::isnan)(float(half(__half{0x7c01})))); - VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); - VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); - VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); - - // Exactly same checks as above, just directly on the half representation. - VERIFY(!(numext::isinf)(half(__half{0x7bff}))); - VERIFY(!(numext::isnan)(half(__half{0x0000}))); - VERIFY((numext::isinf)(half(__half{0xfc00}))); - VERIFY((numext::isnan)(half(__half{0xfc01}))); - VERIFY((numext::isinf)(half(__half{0x7c00}))); - VERIFY((numext::isnan)(half(__half{0x7c01}))); - VERIFY((numext::isnan)(half(0.0 / 0.0))); - VERIFY((numext::isinf)(half(1.0 / 0.0))); - VERIFY((numext::isinf)(half(-1.0 / 0.0))); -} - -void test_arithmetic() -{ - VERIFY_IS_EQUAL(float(half(2) + half(2)), 4); - VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0); - VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f); - VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f); - VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f); - VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f); - VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f); -} - -void test_comparison() -{ - VERIFY(half(1.0f) > half(0.5f)); - VERIFY(half(0.5f) < half(1.0f)); - VERIFY(!(half(1.0f) < half(0.5f))); - VERIFY(!(half(0.5f) > half(1.0f))); - - VERIFY(!(half(4.0f) > half(4.0f))); - VERIFY(!(half(4.0f) < half(4.0f))); - - VERIFY(!(half(0.0f) < half(-0.0f))); - VERIFY(!(half(-0.0f) < half(0.0f))); - VERIFY(!(half(0.0f) > half(-0.0f))); - VERIFY(!(half(-0.0f) > half(0.0f))); - - VERIFY(half(0.2f) > half(-1.0f)); - VERIFY(half(-1.0f) < half(0.2f)); - VERIFY(half(-16.0f) < half(-15.0f)); - - VERIFY(half(1.0f) == half(1.0f)); - VERIFY(half(1.0f) != half(2.0f)); - - // Comparisons with NaNs and infinities. 
- VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); - VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); - - VERIFY(!(half(1.0) == half(0.0 / 0.0))); - VERIFY(!(half(1.0) < half(0.0 / 0.0))); - VERIFY(!(half(1.0) > half(0.0 / 0.0))); - VERIFY(half(1.0) != half(0.0 / 0.0)); - - VERIFY(half(1.0) < half(1.0 / 0.0)); - VERIFY(half(1.0) > half(-1.0 / 0.0)); -} - -void test_functions() -{ - VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); - VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); - - VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); - VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); - - VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); -} - -void test_float16() -{ - CALL_SUBTEST(test_conversion()); - CALL_SUBTEST(test_arithmetic()); - CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_functions()); -} -- cgit v1.2.3 From 7c47d3e663375743e28bfd8863af1091f3b7c8b1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 22:50:25 -0700 Subject: Fixed the type casting benchmarks for fp16 --- bench/tensors/tensor_benchmarks.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 16b388abf..90b9bc741 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -46,8 +46,13 @@ template class BenchmarkSuite { void typeCasting(int num_iters) { eigen_assert(m_ == n_); Eigen::array sizes; - sizes[0] = m_; - sizes[1] = k_; + if (sizeof(T) >= sizeof(int)) { + sizes[0] = m_; + sizes[1] = k_; + } else { + sizes[0] = m_ * sizeof(T) / sizeof(int); + sizes[1] = k_ * sizeof(T) / sizeof(int); + } const TensorMap, Eigen::Aligned> A((int*)a_, sizes); TensorMap, Eigen::Aligned> B(b_, sizes); -- cgit v1.2.3 From 63102ee43d3f0507b75a95a3b8ee6ced92316322 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 23:05:20 -0700 Subject: Turn on the coeffWise benchmarks on fp16 --- bench/tensors/tensor_benchmarks_fp16_gpu.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index 35c6f7489..d34bd73ca 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -28,7 +28,7 @@ BM_FuncGPU(shuffling); BM_FuncGPU(padding); BM_FuncGPU(striding); BM_FuncGPU(broadcasting); -//BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(coeffWiseOp); //BM_FuncGPU(algebraicFunc); //BM_FuncGPU(transcendentalFunc); BM_FuncGPU(rowReduction); -- cgit v1.2.3 From 3bd16457e14a79f0293e5cec29e22f7c62a3a359 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 23:28:04 -0700 Subject: Properly handle complex numbers. 
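The issue fixed below, sketched: numext::real() returns RealScalar, and the previous static_cast<Scalar>(0) made comparisons like realAkk > Scalar(0) compare against a complex number when Scalar = std::complex<T>, which std::complex does not order. Casting the literal to RealScalar keeps the sign bookkeeping well-formed for complex matrices; for real Scalar the two casts coincide. A small exercise of the complex path, with illustrative sizes:

#include <Eigen/Dense>
#include <complex>

int main() {
  typedef std::complex<double> Scalar;
  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> Mat;
  Mat a = Mat::Random(4, 4);
  Mat m = a * a.adjoint();   // Hermitian positive semi-definite
  Eigen::LDLT<Mat> ldlt(m);  // exercises the PositiveSemiDef / sign logic
  return ldlt.info() == Eigen::Success ? 0 : 1;
}
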
--- Eigen/src/Cholesky/LDLT.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index bd4902987..1d767d5c8 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -266,8 +266,8 @@ template<> struct ldlt_inplace if (size <= 1) { transpositions.setIdentity(); - if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; - else if (numext::real(mat.coeff(0,0)) < static_cast(0)) sign = NegativeSemiDef; + if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; + else if (numext::real(mat.coeff(0,0)) < static_cast(0)) sign = NegativeSemiDef; else sign = ZeroSign; return true; } @@ -324,12 +324,12 @@ template<> struct ldlt_inplace A21 /= realAkk; if (sign == PositiveSemiDef) { - if (realAkk < static_cast(0)) sign = Indefinite; + if (realAkk < static_cast(0)) sign = Indefinite; } else if (sign == NegativeSemiDef) { - if (realAkk > static_cast(0)) sign = Indefinite; + if (realAkk > static_cast(0)) sign = Indefinite; } else if (sign == ZeroSign) { - if (realAkk > static_cast(0)) sign = PositiveSemiDef; - else if (realAkk < static_cast(0)) sign = NegativeSemiDef; + if (realAkk > static_cast(0)) sign = PositiveSemiDef; + else if (realAkk < static_cast(0)) sign = NegativeSemiDef; } } -- cgit v1.2.3 From cd2b667ac80adc54df1496273a3cbe83d2eb1607 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 8 Apr 2016 08:12:47 -0400 Subject: Add references to filed LLVM bugs --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 63a2d9f52..3224c36bd 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -179,6 +179,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, // then implements a slow software scalar fallback calling fmaf()! +// Filed LLVM bug: +// https://llvm.org/bugs/show_bug.cgi?id=27216 #if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) // See bug 936. // FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. @@ -195,6 +197,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& // -march=armv7-a, that is a very common case. // See e.g. this thread: // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html + // Filed LLVM bug: + // https://llvm.org/bugs/show_bug.cgi?id=27219 Packet4f r = c; asm volatile( "vmla.f32 %q[r], %q[a], %q[b]" -- cgit v1.2.3 From 2d072b38c134c60fb796e36466fd4e7a1888ce35 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Apr 2016 12:50:25 -0700 Subject: Don't test the division by 0 on float16 when compiling with msvc since msvc detects and errors out on divisions by 0. 
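A hedged aside, not what the patch does (the patch simply #ifdefs the checks out under EIGEN_COMP_MSVC): the same NaN and infinity inputs can be produced portably without writing a literal 0.0 / 0.0, e.g. via numeric_limits, which every compiler accepts:

#include <limits>
#include <cmath>
#include <cassert>

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  const float inf = std::numeric_limits<float>::infinity();
  assert(std::isnan(nan));
  assert(std::isinf(inf) && std::isinf(-inf));
  return 0;
}
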
--- unsupported/test/cxx11_float16.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp index 44ffa67b3..2dc0872d8 100644 --- a/unsupported/test/cxx11_float16.cpp +++ b/unsupported/test/cxx11_float16.cpp @@ -64,9 +64,13 @@ void test_conversion() VERIFY((numext::isnan)(float(half(__half{0xfc01})))); VERIFY((numext::isinf)(float(half(__half{0x7c00})))); VERIFY((numext::isnan)(float(half(__half{0x7c01})))); + +#if !EIGEN_COMP_MSVC + // Visual Studio errors out on divisions by 0 VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); +#endif // Exactly same checks as above, just directly on the half representation. VERIFY(!(numext::isinf)(half(__half{0x7bff}))); @@ -75,9 +79,13 @@ void test_conversion() VERIFY((numext::isnan)(half(__half{0xfc01}))); VERIFY((numext::isinf)(half(__half{0x7c00}))); VERIFY((numext::isnan)(half(__half{0x7c01}))); + +#if !EIGEN_COMP_MSVC + // Visual Studio errors out on divisions by 0 VERIFY((numext::isnan)(half(0.0 / 0.0))); VERIFY((numext::isinf)(half(1.0 / 0.0))); VERIFY((numext::isinf)(half(-1.0 / 0.0))); +#endif } void test_arithmetic() -- cgit v1.2.3 From 0d2a532fc3b25199af03106b6d4ade0f92a30dfc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Apr 2016 13:16:08 -0700 Subject: Created the new EIGEN_TEST_CUDA_CLANG option to compile the CUDA tests using clang instead of nvcc --- cmake/EigenTesting.cmake | 27 +++++++++++++++++++++++---- test/CMakeLists.txt | 11 ++++++++++- unsupported/test/CMakeLists.txt | 4 ++++ 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index d5e3972b5..6f3661921 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -19,10 +19,25 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) - if (${ARGC} GREATER 2) - cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) + if(EIGEN_TEST_CUDA_CLANG) + set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) + if(CUDA_64_BIT_DEVICE_CODE) + link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") + else() + link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib") + endif() + if (${ARGC} GREATER 2) + add_executable(${targetname} ${filename}) + else() + add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) + endif() + target_link_libraries(${targetname} "cudart_static" "cuda" "dl" "rt" "pthread") else() - cuda_add_executable(${targetname} ${filename}) + if (${ARGC} GREATER 2) + cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) + else() + cuda_add_executable(${targetname} ${filename}) + endif() endif() else() add_executable(${targetname} ${filename}) @@ -316,7 +331,11 @@ macro(ei_testing_print_summary) endif() if(EIGEN_TEST_CUDA) - message(STATUS "CUDA: ON") + if(EIGEN_TEST_CUDA_CLANG) + message(STATUS "CUDA: ON (using clang)") + else() + message(STATUS "CUDA: ON (using nvcc)") + endif() else() message(STATUS "CUDA: OFF") endif() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 841c4572b..7bed6a45c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -327,8 +327,14 @@ endif() # CUDA unit tests option(EIGEN_TEST_CUDA "Enable CUDA support in unit tests" OFF) +option(EIGEN_TEST_CUDA_CLANG "Use clang instead of nvcc to compile the CUDA tests" OFF) + +if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang") + message(WARNING 
"EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.") +endif() + if(EIGEN_TEST_CUDA) - + find_package(CUDA 5.0) if(CUDA_FOUND) @@ -336,6 +342,9 @@ if(CUDA_FOUND) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() + if(EIGEN_TEST_CUDA_CLANG) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30") + endif() cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR}) set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index c6a92fe73..b1931d80a 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -190,6 +190,10 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() + if(EIGEN_TEST_CUDA_CLANG) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}") + endif() + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") -- cgit v1.2.3 From 3394379319eb7e7946662142d1ac6bf733a5ae28 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Apr 2016 13:33:59 -0700 Subject: Fixed the packet_traits for half floats. --- Eigen/src/Core/GenericPacketMath.h | 8 ++++---- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 9 +-------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6ff61c18a..001c2ffbf 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -62,7 +62,7 @@ struct default_packet_traits HasRsqrt = 0, HasExp = 0, HasLog = 0, - HasLog10 = 0, + HasLog10 = 0, HasPow = 0, HasSin = 0, @@ -71,9 +71,9 @@ struct default_packet_traits HasASin = 0, HasACos = 0, HasATan = 0, - HasSinh = 0, - HasCosh = 0, - HasTanh = 0, + HasSinh = 0, + HasCosh = 0, + HasTanh = 0, HasLGamma = 0, HasDiGamma = 0, HasZeta = 0, diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 14f0c9415..dc09c74d1 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -33,14 +33,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size=2, HasHalfPacket = 0, - - HasDiv = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - - HasBlend = 0, + HasDiv = 1 }; }; -- cgit v1.2.3 From 8d22967bd9a7963e72622c8fb17cc5322f938f9f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Apr 2016 14:22:39 -0700 Subject: Initial support for taking the power of fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 0a3b301bf..3be7e88d7 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -406,6 +406,9 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::ha template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) { return Eigen::half(::sqrtf(float(a))); } +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), 
float(b))); +} template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { return Eigen::half(::floorf(float(a))); } @@ -432,6 +435,9 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { return Eigen::half(::sqrtf(float(a))); } +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), float(b))); +} static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { return Eigen::half(::floorf(float(a))); } -- cgit v1.2.3 From 995f202cea0677264405496c1ef0c3a8570f7dda Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Apr 2016 14:43:36 -0700 Subject: Disabled the use of half2 on cuda devices of compute capability < 5.3 --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 14 ++------------ Eigen/src/Core/arch/CUDA/TypeCasting.h | 25 ++----------------------- unsupported/test/CMakeLists.txt | 5 +---- 3 files changed, 5 insertions(+), 39 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index dc09c74d1..61d532e4d 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -17,7 +17,8 @@ // we'll use on the host side (SSE, AVX, ...) #if defined(__CUDACC__) && defined(EIGEN_USE_GPU) -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +// Most of the following operations require arch >= 5.3 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 namespace Eigen { namespace internal { @@ -67,20 +68,12 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, co template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { -#if __CUDA_ARCH__ >= 320 return __ldg((const half2*)from); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { -#if __CUDA_ARCH__ >= 320 return __halves2half2(__ldg(from+0), __ldg(from+1)); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif } template<> EIGEN_DEVICE_FUNC inline half2 pgather(const half* from, Index stride) { @@ -113,8 +106,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = __halves2half2(a2, b2); } -// The following operations require arch >= 5.3 -#if __CUDA_ARCH__ >= 530 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const half& a) { return __halves2half2(a, __hadd(a, __float2half(1.0f))); } @@ -190,7 +181,6 @@ template<> EIGEN_DEVICE_FUNC inline half predux_min(const half2& a) { template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) { return __hmul(__low2half(a), __high2half(a)); } -#endif } // end namespace internal diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index b2a9724de..396b38eaf 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -71,6 +71,7 @@ struct functor_traits > +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 template <> struct type_casting_traits { @@ -82,22 +83,9 @@ struct type_casting_traits { }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 float2 r1 = __half22float2(a); float2 r2 = __half22float2(b); return make_float4(r1.x, r1.y, r2.x, r2.y); -#else - half r1; - r1.x = a.x & 0xFFFF; - half r2; - r2.x = 
(a.x & 0xFFFF0000) >> 16; - half r3; - r3.x = b.x & 0xFFFF; - half r4; - r4.x = (b.x & 0xFFFF0000) >> 16; - return make_float4(static_cast(r1), static_cast(r2), - static_cast(r3), static_cast(r4)); -#endif } template <> @@ -111,19 +99,10 @@ struct type_casting_traits { template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { // Simply discard the second half of the input -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float22half2_rn(make_float2(a.x, a.y)); -#else - half r1 = static_cast(a.x); - half r2 = static_cast(a.y); - half2 r; - r.x = 0; - r.x |= r1.x; - r.x |= (static_cast(r2.x) << 16); - return r; -#endif } +#endif #endif } // end namespace internal diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index b1931d80a..c088df1c1 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -210,10 +210,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) ei_add_test(cxx11_tensor_random_cuda) endif() - # Operations other that casting of half floats are only supported starting with arch 5.3 - if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 52) - ei_add_test(cxx11_tensor_of_float16_cuda) - endif() + ei_add_test(cxx11_tensor_of_float16_cuda) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() -- cgit v1.2.3 From 89a3dc35a339eee62635a133b2c351ce45011419 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Apr 2016 15:56:16 -0700 Subject: Fixed isfinite_impl: NumTraits::highest() and NumTraits::lowest() are finite numbers. --- Eigen/src/Core/MathFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index fd73f543b..2f66c7463 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -710,7 +710,7 @@ isfinite_impl(const T& x) using std::isfinite; return isfinite EIGEN_NOT_A_MACRO (x); #else - return x::highest() && x>NumTraits::lowest(); + return x<=NumTraits::highest() && x>=NumTraits::lowest(); #endif } -- cgit v1.2.3 From 01bd577288bb737ea25485eed1a25d7d3433b01f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Apr 2016 16:40:10 -0700 Subject: Fixed the implementation of Eigen::numext::isfinite, Eigen::numext::isnan, andEigen::numext::isinf on CUDA devices --- Eigen/src/Core/MathFunctions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 2f66c7463..dd19f080b 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -705,7 +705,7 @@ typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits: isfinite_impl(const T& x) { #ifdef __CUDA_ARCH__ - return (isfinite)(x); + return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; return isfinite EIGEN_NOT_A_MACRO (x); @@ -720,7 +720,7 @@ typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits: isinf_impl(const T& x) { #ifdef __CUDA_ARCH__ - return (isinf)(x); + return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; return isinf EIGEN_NOT_A_MACRO (x); @@ -735,7 +735,7 @@ typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits: isnan_impl(const T& x) { #ifdef __CUDA_ARCH__ - return (isnan)(x); + return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; return isnan EIGEN_NOT_A_MACRO (x); -- cgit v1.2.3 From 5da90fc8dd1570ebfbc0a9b6c058207b3bec15b6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Apr 2016 19:40:48 -0700 Subject: Use numext::abs instead of 
std::abs in scalar_fuzzy_default_impl to make it usable inside GPU kernels.
---
 Eigen/src/Core/MathFunctions.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index dd19f080b..8e7dd2b73 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1128,14 +1128,12 @@ struct scalar_fuzzy_default_impl
   template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
-    EIGEN_USING_STD_MATH(abs);
-    return abs(x) <= abs(y) * prec;
+    return numext::abs(x) <= numext::abs(y) * prec;
   }
   EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
-    EIGEN_USING_STD_MATH(abs);
-    return abs(x - y) <= numext::mini(abs(x), abs(y)) * prec;
+    return numext::abs(x - y) <= numext::mini(numext::abs(x), numext::abs(y)) * prec;
   }
   EIGEN_DEVICE_FUNC
   static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
-- 
cgit v1.2.3


From a05a683d8329918c29efd931981fc1ead5b6deea Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Sat, 9 Apr 2016 10:49:19 +0200
Subject: bug #1160: fix and relax some lm unit tests by turning failures to
 warnings

---
 unsupported/test/levenberg_marquardt.cpp | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index a2bdb99e4..65afa0f78 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -789,7 +789,8 @@ void testNistMGH10(void)
   MGH10_functor functor;
   LevenbergMarquardt<MGH10_functor> lm(functor);
   info = lm.minimize(x);
-  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  // was: VERIFY_IS_EQUAL(info, 1);
 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
@@ -799,9 +800,13 @@ void testNistMGH10(void)
   VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
 
   // check return value
-  //VERIFY_IS_EQUAL(info, 1);
+
+  ++g_test_level;
   VERIFY_IS_EQUAL(lm.nfev(), 284 );
   VERIFY_IS_EQUAL(lm.njev(), 249 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 284 * 3/2);
+  VERIFY(lm.njev() < 249 * 3/2);
 
   /*
    * Second try
   */
   x<< 0.02, 4000., 250.;
   // do the computation
   info = lm.minimize(x);
+  ++g_test_level;
   VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  // was: VERIFY_IS_EQUAL(info, 1);
+  --g_test_level;
 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
@@ -819,9 +827,12 @@
   VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
 
   // check return value
-  //VERIFY_IS_EQUAL(info, 1);
+  ++g_test_level;
   VERIFY_IS_EQUAL(lm.nfev(), 126);
   VERIFY_IS_EQUAL(lm.njev(), 116);
+  --g_test_level;
+  VERIFY(lm.nfev() < 126 * 3/2);
+  VERIFY(lm.njev() < 116 * 3/2);
 }
@@ -896,8 +907,12 @@ void testNistBoxBOD(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
+  ++g_test_level;
   VERIFY_IS_EQUAL(lm.nfev(), 16 );
   VERIFY_IS_EQUAL(lm.njev(), 15 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 16 * 3/2);
+  VERIFY(lm.njev() < 15 * 3/2);
 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
   // check x
-- 
cgit v1.2.3


From af2161cdb4ec19fbc44bcf7bca7cae662b6b8085 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Sat, 9 Apr 2016 11:14:02 +0200
Subject: bug #1197: fix/relax some LM unit tests

---
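Both LM patches here use the same relaxation idiom: the exact historical nfev/njev counts are kept but bracketed in ++g_test_level / --g_test_level, which downgrades VERIFY failures to warnings, while a loose 4/3 bound stays a hard failure. A minimal standalone sketch (not Eigen code; the count 300 is made up) of why the 4/3 macro works despite integer division:

#include <cassert>

// Same textual definition as the LM_EVAL_COUNT_TOL macro introduced below.
#define LM_EVAL_COUNT_TOL 4/3

int main()
{
  int nfev = 300;  // a hypothetical evaluation count from a solver run
  // The macro expands textually, so the bound parses as (284 * 4) / 3 = 378;
  // 4/3 on its own would truncate to 1 and make the bound useless.
  assert(284 * LM_EVAL_COUNT_TOL == (284 * 4) / 3);
  assert(nfev < 284 * LM_EVAL_COUNT_TOL);  // the relaxed, still-fatal check
  return 0;
}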
 unsupported/test/NonLinearOptimization.cpp | 16 ++++++++++++----
 unsupported/test/levenberg_marquardt.cpp | 19 +++++++++++--------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 724ea7b5b..6a5ed057f 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -14,6 +14,9 @@
 using std::sqrt;
 
+// tolerance for checking number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
+
 int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
 {
     /* subroutine fcn for chkder example. */
@@ -1023,7 +1026,8 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.njev, 72);
   // check norm^2
   std::cout.precision(30);
-  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4290986055242372e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  std::cout << lm.fvec.squaredNorm() << "\n";
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -1044,7 +1048,7 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.nfev, 9);
   VERIFY_IS_EQUAL(lm.njev, 8);
   // check norm^2
-  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.430571737783119393e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -1354,8 +1358,12 @@ void testNistMGH17(void)
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY(lm.nfev < 650); // 602
-  VERIFY(lm.njev < 600); // 545
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev, 602); // 602
+  VERIFY_IS_EQUAL(lm.njev, 545); // 545
+  --g_test_level;
+  VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL);
 
   /*
    * Second try
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 65afa0f78..6dc17bd17 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -23,6 +23,9 @@
 using std::sqrt;
 
+// tolerance for checking number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
+
 struct lmder_functor : DenseFunctor<double>
 {
     lmder_functor(void): DenseFunctor<double>(3,15) {}
@@ -631,7 +634,7 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.nfev(), 79);
   VERIFY_IS_EQUAL(lm.njev(), 72);
   // check norm^2
-//  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.430899764097e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -652,7 +655,7 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.nfev(), 9);
   VERIFY_IS_EQUAL(lm.njev(), 8);
   // check norm^2
-//  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.428595533845e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -805,8 +808,8 @@ void testNistMGH10(void)
   VERIFY_IS_EQUAL(lm.nfev(), 284 );
   VERIFY_IS_EQUAL(lm.njev(), 249 );
   --g_test_level;
-  VERIFY(lm.nfev() < 284 * 3/2);
-  VERIFY(lm.njev() < 249 * 3/2);
+  VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL);
 
   /*
    * Second try
@@ -831,8 +834,8 @@ void testNistMGH10(void)
   VERIFY_IS_EQUAL(lm.nfev(), 126);
VERIFY_IS_EQUAL(lm.njev(), 116); --g_test_level; - VERIFY(lm.nfev() < 126 * 3/2); - VERIFY(lm.njev() < 116 * 3/2); + VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL); } @@ -911,8 +914,8 @@ void testNistBoxBOD(void) VERIFY_IS_EQUAL(lm.nfev(), 16 ); VERIFY_IS_EQUAL(lm.njev(), 15 ); --g_test_level; - VERIFY(lm.nfev() < 16 * 3/2); - VERIFY(lm.njev() < 15 * 3/2); + VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03); // check x -- cgit v1.2.3 From de057ebe541d5a6c1297ea94a89dcaf35582d44e Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Sat, 9 Apr 2016 20:07:36 +0100 Subject: Added nans to zeta function. --- Eigen/src/Core/SpecialFunctions.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 2a0a6ff15..954972cdd 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -881,13 +881,14 @@ struct zeta_impl { const Scalar maxnum = NumTraits::infinity(); const Scalar zero = 0.0, half = 0.5, one = 1.0; const Scalar machep = igamma_helper::machep(); + const Scalar nan = NumTraits::quiet_NaN(); if( x == one ) return maxnum; if( x < one ) { - return zero; + return nan; } if( q <= zero ) @@ -899,7 +900,7 @@ struct zeta_impl { p = x; r = numext::floor(p); if (p != r) - return zero; + return nan; } /* Permit negative q but continue sum until n+q > +9 . -- cgit v1.2.3 From 643b6976493c122ffb7205cc3ab893f28f9e1634 Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Sun, 10 Apr 2016 00:37:53 +0100 Subject: Proper handling of domain errors. --- Eigen/src/Core/SpecialFunctions.h | 7 ++++++- test/array.cpp | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 954972cdd..2dc7b22fc 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -970,9 +970,14 @@ struct polygamma_impl { static Scalar run(Scalar n, Scalar x) { Scalar zero = 0.0, one = 1.0; Scalar nplus = n + one; + const Scalar nan = NumTraits::quiet_NaN(); + // Check that n is an integer + if (numext::floor(n) != n) { + return nan; + } // Just return the digamma function for n = 1 - if (n == zero) { + else if (n == zero) { return digamma_impl::run(x); } // Use the same implementation as scipy diff --git a/test/array.cpp b/test/array.cpp index 8b0a34722..beaa62221 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -331,11 +331,13 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(numext::zeta(Scalar(3), Scalar(-2.5)), RealScalar(0.054102025820864097)); VERIFY_IS_EQUAL(numext::zeta(Scalar(1), Scalar(1.2345)), // The second scalar does not matter std::numeric_limits::infinity()); + VERIFY((numext::isnan)(numext::zeta(Scalar(0.9), Scalar(1.2345)))); // The second scalar does not matter // Check the polygamma against scipy.special.polygamma examples VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(2)), RealScalar(0.644934066848)); VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(3)), RealScalar(0.394934066848)); VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(25.5)), RealScalar(0.0399946696496)); + VERIFY((numext::isnan)(numext::polygamma(Scalar(1.5), Scalar(1.2345)))); // The second scalar does not matter // Check the polygamma function over a larger range of values VERIFY_IS_APPROX(numext::polygamma(Scalar(17), Scalar(4.7)), 
RealScalar(293.334565435)); -- cgit v1.2.3 From fc6a0ebb1c98ab51c575bcd2688c1d9d11200267 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 10:54:58 +0200 Subject: Typos in doc. --- doc/TutorialReshapeSlicing.dox | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/TutorialReshapeSlicing.dox b/doc/TutorialReshapeSlicing.dox index eb0fb0df0..3730a5de6 100644 --- a/doc/TutorialReshapeSlicing.dox +++ b/doc/TutorialReshapeSlicing.dox @@ -37,10 +37,10 @@ Here is another example reshaping a 2x6 matrix to a 6x2 one: \section TutorialSlicing Slicing -Slicing consists in taking a set of rows, or columns, or elements, uniformly spaced within a matrix. +Slicing consists in taking a set of rows, columns, or elements, uniformly spaced within a matrix. Again, the class Map allows to easily mimic this feature. -For instance, one can take skip every P elements in a vector: +For instance, one can skip every P elements in a vector:
[Example / Output table: the embedded snippet and its output did not survive extraction]
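Since the table's snippet is lost, here is a minimal sketch of the strided-Map slicing the tutorial paragraph above describes (illustrative values, not the original dox example):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::VectorXf v(12);
  v << 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11;
  const int P = 3;
  // View every P-th coefficient of v, without copying, by mapping
  // v's data with a run-time inner stride of P.
  Eigen::Map<Eigen::VectorXf, 0, Eigen::InnerStride<> >
      slice(v.data(), v.size() / P, Eigen::InnerStride<>(P));
  std::cout << slice.transpose() << std::endl;  // prints: 0 3 6 9
  return 0;
}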
-- cgit v1.2.3 From 675e0a222442b1d7446a843f15128c467502160a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 15:06:20 +0200 Subject: Fix static/inline keywords order. --- Eigen/src/Core/AssignEvaluator.h | 8 +++---- Eigen/src/Core/SpecialFunctions.h | 48 +++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index a9a524130..3de8aa9a2 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -788,8 +788,8 @@ template void check_for_aliasing(const Dst &dst, con template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -806,8 +806,8 @@ struct Assignment template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); src.evalTo(dst); diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 2dc7b22fc..adb055b15 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -79,8 +79,8 @@ namespace cephes { */ template struct polevl { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static Scalar run(const Scalar x, const Scalar coef[]) { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) { EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); return polevl::run(x, coef) * x + coef[N]; @@ -89,8 +89,8 @@ struct polevl { template struct polevl { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static Scalar run(const Scalar, const Scalar coef[]) { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) { return coef[0]; } }; @@ -144,7 +144,7 @@ struct digamma_retval { template struct digamma_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar x) { + static EIGEN_STRONG_INLINE Scalar run(Scalar x) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); @@ -428,20 +428,20 @@ template struct igamma_impl; // predeclare igamma_impl template struct igamma_helper { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static Scalar big() { assert(false && "big not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; } }; template <> struct igamma_helper { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static float machep() { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float machep() { return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 } - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static float big() { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float big() { // use epsneg (1.0 - epsneg == 1.0) return 1.0 / (NumTraits::epsilon() / 2); } @@ -449,12 +449,12 @@ struct igamma_helper { template <> struct igamma_helper { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static double machep() { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double machep() { return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static double big() { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double big() { return 1.0 / NumTraits::epsilon(); } }; @@ -605,7 +605,7 @@ struct igamma_retval { template struct igamma_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar a, Scalar x) { + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); @@ -736,7 +736,7 @@ struct zeta_retval { template struct zeta_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar x, Scalar q) { + static EIGEN_STRONG_INLINE Scalar run(Scalar x, Scalar q) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); @@ -757,8 +757,8 @@ struct zeta_impl_series { template <> struct zeta_impl_series { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static bool run(float& a, float& b, float& s, const float x, const float machep) { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) { int i = 0; while(i < 9) { @@ -777,8 +777,8 @@ struct zeta_impl_series { template <> struct zeta_impl_series { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static bool run(double& a, double& b, double& s, const double x, const double machep) { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) { int i = 0; while( (i < 9) || (a <= 9.0) ) { @@ -955,7 +955,7 @@ struct polygamma_retval { template struct polygamma_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar n, Scalar x) { + static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); -- cgit v1.2.3 From 4e8e5888d7a78d514e54a518f6692f2838314328 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 15:12:44 +0200 Subject: Improve constness of blas level-3 interface. 
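A minimal sketch of what the const-qualified level-3 prototypes below allow: read-only inputs can be passed to gemm with no const_cast. The wrapper multiply is illustrative, and sgemm_ assumes the usual Fortran underscore name that BLASFUNC typically expands to:

// Declared here with the const-correct signature this patch introduces.
extern "C" int sgemm_(const char*, const char*, const int*, const int*, const int*,
                      const float*, const float*, const int*,
                      const float*, const int*,
                      const float*, float*, const int*);

// C = alpha*A*B + beta*C on n-by-n column-major matrices.
void multiply(const float* A, const float* B, float* C, int n)
{
  const char  notrans = 'N';
  const float alpha = 1.0f, beta = 0.0f;
  sgemm_(&notrans, &notrans, &n, &n, &n, &alpha, A, &n, B, &n, &beta, C, &n);
}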
--- Eigen/src/misc/blas.h | 266 ++++++++++++++++++-------------------------------- blas/common.h | 19 ++++ blas/level3_impl.h | 81 ++++++++------- 3 files changed, 158 insertions(+), 208 deletions(-) diff --git a/Eigen/src/misc/blas.h b/Eigen/src/misc/blas.h index 6fce99ed5..ae0c393f1 100644 --- a/Eigen/src/misc/blas.h +++ b/Eigen/src/misc/blas.h @@ -30,15 +30,15 @@ int BLASFUNC(cdotcw) (int *, float *, int *, float *, int *, float*); int BLASFUNC(zdotuw) (int *, double *, int *, double *, int *, double*); int BLASFUNC(zdotcw) (int *, double *, int *, double *, int *, double*); -int BLASFUNC(saxpy) (int *, float *, float *, int *, float *, int *); -int BLASFUNC(daxpy) (int *, double *, double *, int *, double *, int *); -int BLASFUNC(qaxpy) (int *, double *, double *, int *, double *, int *); -int BLASFUNC(caxpy) (int *, float *, float *, int *, float *, int *); -int BLASFUNC(zaxpy) (int *, double *, double *, int *, double *, int *); -int BLASFUNC(xaxpy) (int *, double *, double *, int *, double *, int *); -int BLASFUNC(caxpyc)(int *, float *, float *, int *, float *, int *); -int BLASFUNC(zaxpyc)(int *, double *, double *, int *, double *, int *); -int BLASFUNC(xaxpyc)(int *, double *, double *, int *, double *, int *); +int BLASFUNC(saxpy) (const int *, const float *, const float *, const int *, float *, int *); +int BLASFUNC(daxpy) (const int *, const double *, const double *, const int *, double *, int *); +int BLASFUNC(qaxpy) (const int *, const double *, const double *, const int *, double *, int *); +int BLASFUNC(caxpy) (const int *, const float *, const float *, const int *, float *, int *); +int BLASFUNC(zaxpy) (const int *, const double *, const double *, const int *, double *, int *); +int BLASFUNC(xaxpy) (const int *, const double *, const double *, const int *, double *, int *); +int BLASFUNC(caxpyc)(const int *, const float *, const float *, const int *, float *, int *); +int BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, int *); +int BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, int *); int BLASFUNC(scopy) (int *, float *, int *, float *, int *); int BLASFUNC(dcopy) (int *, double *, int *, double *, int *); @@ -177,31 +177,19 @@ int BLASFUNC(xgeru)(int *, int *, double *, double *, int *, int BLASFUNC(xgerc)(int *, int *, double *, double *, int *, double *, int *, double *, int *); -int BLASFUNC(sgemv)(char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(dgemv)(char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(qgemv)(char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(cgemv)(char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zgemv)(char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xgemv)(char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); +int BLASFUNC(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qgemv)(const char *, const int *, const int *, const double *, const double *, const 
int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -int BLASFUNC(strsv) (char *, char *, char *, int *, float *, int *, - float *, int *); -int BLASFUNC(dtrsv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(qtrsv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(ctrsv) (char *, char *, char *, int *, float *, int *, - float *, int *); -int BLASFUNC(ztrsv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(xtrsv) (char *, char *, char *, int *, double *, int *, - double *, int *); +int BLASFUNC(strsv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(dtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(qtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(ctrsv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(ztrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(xtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); int BLASFUNC(stpsv) (char *, char *, char *, int *, float *, float *, int *); int BLASFUNC(dtpsv) (char *, char *, char *, int *, double *, double *, int *); @@ -210,18 +198,12 @@ int BLASFUNC(ctpsv) (char *, char *, char *, int *, float *, float *, int *); int BLASFUNC(ztpsv) (char *, char *, char *, int *, double *, double *, int *); int BLASFUNC(xtpsv) (char *, char *, char *, int *, double *, double *, int *); -int BLASFUNC(strmv) (char *, char *, char *, int *, float *, int *, - float *, int *); -int BLASFUNC(dtrmv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(qtrmv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(ctrmv) (char *, char *, char *, int *, float *, int *, - float *, int *); -int BLASFUNC(ztrmv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(xtrmv) (char *, char *, char *, int *, double *, int *, - double *, int *); +int BLASFUNC(strmv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(dtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(qtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(ctrmv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(ztrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(xtrmv) (const char *, const char 
*, const char *, const int *, const double *, const int *, double *, const int *); int BLASFUNC(stpmv) (char *, char *, char *, int *, float *, float *, int *); int BLASFUNC(dtpmv) (char *, char *, char *, int *, double *, double *, int *); @@ -244,18 +226,12 @@ int BLASFUNC(ctbsv) (char *, char *, char *, int *, int *, float *, int *, floa int BLASFUNC(ztbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *); int BLASFUNC(xtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *); -int BLASFUNC(ssymv) (char *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(dsymv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(qsymv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(csymv) (char *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zsymv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xsymv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); +int BLASFUNC(ssymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(csymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(sspmv) (char *, int *, float *, float *, float *, int *, float *, float *, int *); @@ -347,12 +323,9 @@ int BLASFUNC(zhpr2) (char *, int *, double *, int BLASFUNC(xhpr2) (char *, int *, double *, double *, int *, double *, int *, double *); -int BLASFUNC(chemv) (char *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zhemv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xhemv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); +int BLASFUNC(chemv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(chpmv) (char *, int *, float *, float *, float *, int *, float *, float *, int *); @@ -401,18 +374,12 @@ int BLASFUNC(xhbmv)(char *, int *, int *, double *, double *, int *, /* Level 3 routines */ -int BLASFUNC(sgemm)(char *, char *, int *, int *, int *, float *, - float *, int *, float *, int *, float *, float *, int 
*); -int BLASFUNC(dgemm)(char *, char *, int *, int *, int *, double *, - double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(qgemm)(char *, char *, int *, int *, int *, double *, - double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(cgemm)(char *, char *, int *, int *, int *, float *, - float *, int *, float *, int *, float *, float *, int *); -int BLASFUNC(zgemm)(char *, char *, int *, int *, int *, double *, - double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(xgemm)(char *, char *, int *, int *, int *, double *, - double *, int *, double *, int *, double *, double *, int *); +int BLASFUNC(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(cgemm3m)(char *, char *, int *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *); @@ -434,84 +401,48 @@ int BLASFUNC(zge2mm)(char *, char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(strsm)(char *, char *, char *, char *, int *, int *, - float *, float *, int *, float *, int *); -int BLASFUNC(dtrsm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(qtrsm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(ctrsm)(char *, char *, char *, char *, int *, int *, - float *, float *, int *, float *, int *); -int BLASFUNC(ztrsm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(xtrsm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); - -int BLASFUNC(strmm)(char *, char *, char *, char *, int *, int *, - float *, float *, int *, float *, int *); -int BLASFUNC(dtrmm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(qtrmm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(ctrmm)(char *, char *, char *, char *, int *, int *, - float *, float *, int *, float *, int *); -int BLASFUNC(ztrmm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(xtrmm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); - -int BLASFUNC(ssymm)(char *, char *, int *, int *, 
float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(dsymm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(qsymm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(csymm)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zsymm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xsymm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); - -int BLASFUNC(csymm3m)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); - -int BLASFUNC(ssyrk)(char *, char *, int *, int *, float *, float *, int *, - float *, float *, int *); -int BLASFUNC(dsyrk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); -int BLASFUNC(qsyrk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); -int BLASFUNC(csyrk)(char *, char *, int *, int *, float *, float *, int *, - float *, float *, int *); -int BLASFUNC(zsyrk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); -int BLASFUNC(xsyrk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); - -int BLASFUNC(ssyr2k)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(dsyr2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(qsyr2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(csyr2k)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zsyr2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(xsyr2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); - -int BLASFUNC(chemm)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zhemm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xhemm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); +int BLASFUNC(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(qtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(ztrsm)(const char *, const char 
*, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(xtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +int BLASFUNC(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(qtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(xtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +int BLASFUNC(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(csymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +int BLASFUNC(csymm3m)(char *, char *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *); +int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); +int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); + +int BLASFUNC(ssyrk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(csyrk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsyrk)(const char *, const char *, const int *, const int *, const double *, const 
double *, const int *, const double *, double *, const int *); +int BLASFUNC(xsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); + +int BLASFUNC(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); +int BLASFUNC(qsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); +int BLASFUNC(csyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); +int BLASFUNC(xsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); + +int BLASFUNC(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(chemm3m)(char *, char *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *); @@ -520,25 +451,16 @@ int BLASFUNC(zhemm3m)(char *, char *, int *, int *, double *, double *, int *, int BLASFUNC(xhemm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(cherk)(char *, char *, int *, int *, float *, float *, int *, - float *, float *, int *); -int BLASFUNC(zherk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); -int BLASFUNC(xherk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); - -int BLASFUNC(cher2k)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zher2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(xher2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(cher2m)(char *, char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zher2m)(char *, char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(xher2m)(char *, char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); +int BLASFUNC(cherk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zherk)(const char *, const char *, 
const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); + +int BLASFUNC(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(cher2m)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); +int BLASFUNC(xher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); int BLASFUNC(sgemt)(char *, int *, int *, float *, float *, int *, float *, int *); diff --git a/blas/common.h b/blas/common.h index 5ecb153e2..acb50af1b 100644 --- a/blas/common.h +++ b/blas/common.h @@ -104,18 +104,37 @@ matrix(T* data, int rows, int cols, int stride) return Map, 0, OuterStride<> >(data, rows, cols, OuterStride<>(stride)); } +template +Map, 0, OuterStride<> > +matrix(const T* data, int rows, int cols, int stride) +{ + return Map, 0, OuterStride<> >(data, rows, cols, OuterStride<>(stride)); +} + template Map, 0, InnerStride > make_vector(T* data, int size, int incr) { return Map, 0, InnerStride >(data, size, InnerStride(incr)); } +template +Map, 0, InnerStride > make_vector(const T* data, int size, int incr) +{ + return Map, 0, InnerStride >(data, size, InnerStride(incr)); +} + template Map > make_vector(T* data, int size) { return Map >(data, size); } +template +Map > make_vector(const T* data, int size) +{ + return Map >(data, size); +} + template T* get_compact_vector(T* x, int n, int incx) { diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 267a727ef..beb36c47d 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -9,7 +9,8 @@ #include #include "common.h" -int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) +int EIGEN_BLAS_FUNC(gemm)(const char *opa, const char *opb, const int *m, const int *n, const int *k, const RealScalar *palpha, + const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc) { // std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n"; typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar, internal::level3_blocking&, Eigen::internal::GemmParallelInfo*); @@ -37,11 +38,11 @@ int EIGEN_BLAS_FUNC(gemm)(char 
*opa, char *opb, int *m, int *n, int *k, RealScal 0 }; - Scalar* a = reinterpret_cast(pa); - Scalar* b = reinterpret_cast(pb); + const Scalar* a = reinterpret_cast(pa); + const Scalar* b = reinterpret_cast(pb); Scalar* c = reinterpret_cast(pc); - Scalar alpha = *reinterpret_cast(palpha); - Scalar beta = *reinterpret_cast(pbeta); + Scalar alpha = *reinterpret_cast(palpha); + Scalar beta = *reinterpret_cast(pbeta); int info = 0; if(OP(*opa)==INVALID) info = 1; @@ -74,7 +75,8 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal return 0; } -int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb) +int EIGEN_BLAS_FUNC(trsm)(const char *side, const char *uplo, const char *opa, const char *diag, const int *m, const int *n, + const RealScalar *palpha, const RealScalar *pa, const int *lda, RealScalar *pb, const int *ldb) { // std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n"; typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, internal::level3_blocking&); @@ -137,9 +139,9 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, 0 }; - Scalar* a = reinterpret_cast(pa); + const Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); - Scalar alpha = *reinterpret_cast(palpha); + Scalar alpha = *reinterpret_cast(palpha); int info = 0; if(SIDE(*side)==INVALID) info = 1; @@ -178,7 +180,8 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, // b = alpha*op(a)*b for side = 'L'or'l' // b = alpha*b*op(a) for side = 'R'or'r' -int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb) +int EIGEN_BLAS_FUNC(trmm)(const char *side, const char *uplo, const char *opa, const char *diag, const int *m, const int *n, + const RealScalar *palpha, const RealScalar *pa, const int *lda, RealScalar *pb, const int *ldb) { // std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n"; typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); @@ -241,9 +244,9 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, 0 }; - Scalar* a = reinterpret_cast(pa); + const Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); - Scalar alpha = *reinterpret_cast(palpha); + Scalar alpha = *reinterpret_cast(palpha); int info = 0; if(SIDE(*side)==INVALID) info = 1; @@ -281,14 +284,15 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, // c = alpha*a*b + beta*c for side = 'L'or'l' // c = alpha*b*a + beta*c for side = 'R'or'r -int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) +int EIGEN_BLAS_FUNC(symm)(const char *side, const char *uplo, const int *m, const int *n, const RealScalar *palpha, + const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc) { // std::cerr << "in symm " << *side 
<< " " << *uplo << " " << *m << "x" << *n << " lda:" << *lda << " ldb:" << *ldb << " ldc:" << *ldc << " alpha:" << *palpha << " beta:" << *pbeta << "\n"; - Scalar* a = reinterpret_cast(pa); - Scalar* b = reinterpret_cast(pb); + const Scalar* a = reinterpret_cast(pa); + const Scalar* b = reinterpret_cast(pb); Scalar* c = reinterpret_cast(pc); - Scalar alpha = *reinterpret_cast(palpha); - Scalar beta = *reinterpret_cast(pbeta); + Scalar alpha = *reinterpret_cast(palpha); + Scalar beta = *reinterpret_cast(pbeta); int info = 0; if(SIDE(*side)==INVALID) info = 1; @@ -350,7 +354,8 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa // c = alpha*a*a' + beta*c for op = 'N'or'n' // c = alpha*a'*a + beta*c for op = 'T'or't','C'or'c' -int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc) +int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const int *k, + const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pbeta, RealScalar *pc, const int *ldc) { // std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; #if !ISCOMPLEX @@ -373,10 +378,10 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp }; #endif - Scalar* a = reinterpret_cast(pa); + const Scalar* a = reinterpret_cast(pa); Scalar* c = reinterpret_cast(pc); - Scalar alpha = *reinterpret_cast(palpha); - Scalar beta = *reinterpret_cast(pbeta); + Scalar alpha = *reinterpret_cast(palpha); + Scalar beta = *reinterpret_cast(pbeta); int info = 0; if(UPLO(*uplo)==INVALID) info = 1; @@ -429,13 +434,14 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp // c = alpha*a*b' + alpha*b*a' + beta*c for op = 'N'or'n' // c = alpha*a'*b + alpha*b'*a + beta*c for op = 'T'or't' -int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) +int EIGEN_BLAS_FUNC(syr2k)(const char *uplo, const char *op, const int *n, const int *k, const RealScalar *palpha, + const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc) { - Scalar* a = reinterpret_cast(pa); - Scalar* b = reinterpret_cast(pb); + const Scalar* a = reinterpret_cast(pa); + const Scalar* b = reinterpret_cast(pb); Scalar* c = reinterpret_cast(pc); - Scalar alpha = *reinterpret_cast(palpha); - Scalar beta = *reinterpret_cast(pbeta); + Scalar alpha = *reinterpret_cast(palpha); + Scalar beta = *reinterpret_cast(pbeta); // std::cerr << "in syr2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n"; @@ -496,13 +502,14 @@ int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal // c = alpha*a*b + beta*c for side = 'L'or'l' // c = alpha*b*a + beta*c for side = 'R'or'r -int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) +int EIGEN_BLAS_FUNC(hemm)(const char *side, const char *uplo, const int *m, const int *n, const RealScalar *palpha, + const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc) 
{ - Scalar* a = reinterpret_cast(pa); - Scalar* b = reinterpret_cast(pb); + const Scalar* a = reinterpret_cast(pa); + const Scalar* b = reinterpret_cast(pb); Scalar* c = reinterpret_cast(pc); - Scalar alpha = *reinterpret_cast(palpha); - Scalar beta = *reinterpret_cast(pbeta); + Scalar alpha = *reinterpret_cast(palpha); + Scalar beta = *reinterpret_cast(pbeta); // std::cerr << "in hemm " << *side << " " << *uplo << " " << *m << " " << *n << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n"; @@ -554,7 +561,8 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa // c = alpha*a*conj(a') + beta*c for op = 'N'or'n' // c = alpha*conj(a')*a + beta*c for op = 'C'or'c' -int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc) +int EIGEN_BLAS_FUNC(herk)(const char *uplo, const char *op, const int *n, const int *k, + const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pbeta, RealScalar *pc, const int *ldc) { // std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; @@ -574,7 +582,7 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp 0 }; - Scalar* a = reinterpret_cast(pa); + const Scalar* a = reinterpret_cast(pa); Scalar* c = reinterpret_cast(pc); RealScalar alpha = *palpha; RealScalar beta = *pbeta; @@ -620,12 +628,13 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp // c = alpha*a*conj(b') + conj(alpha)*b*conj(a') + beta*c, for op = 'N'or'n' // c = alpha*conj(a')*b + conj(alpha)*conj(b')*a + beta*c, for op = 'C'or'c' -int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) +int EIGEN_BLAS_FUNC(her2k)(const char *uplo, const char *op, const int *n, const int *k, + const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc) { - Scalar* a = reinterpret_cast(pa); - Scalar* b = reinterpret_cast(pb); + const Scalar* a = reinterpret_cast(pa); + const Scalar* b = reinterpret_cast(pb); Scalar* c = reinterpret_cast(pc); - Scalar alpha = *reinterpret_cast(palpha); + Scalar alpha = *reinterpret_cast(palpha); RealScalar beta = *pbeta; // std::cerr << "in her2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n"; -- cgit v1.2.3 From 6a9ca88e7e1bb72de621806b51c5a4fd17310943 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 15:17:14 +0200 Subject: Relax dependency on MKL for EIGEN_USE_BLAS --- .../products/GeneralMatrixMatrixTriangular_MKL.h | 24 +++++----- Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h | 17 +++---- Eigen/src/Core/products/GeneralMatrixVector_MKL.h | 20 ++++----- .../Core/products/SelfadjointMatrixMatrix_MKL.h | 52 +++++++--------------- .../Core/products/SelfadjointMatrixVector_MKL.h | 16 +++---- .../src/Core/products/TriangularMatrixMatrix_MKL.h | 25 ++++------- .../src/Core/products/TriangularMatrixVector_MKL.h | 36 +++++++-------- .../src/Core/products/TriangularSolverMatrix_MKL.h | 28 +++++------- Eigen/src/Core/util/MKL_support.h | 12 ++++- 9 files changed, 98 insertions(+), 132 deletions(-) diff --git 
a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h index 3deed068e..1cdf48fbf 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h @@ -50,25 +50,26 @@ template { \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) \ { \ if (lhs==rhs) { \ general_matrix_matrix_rankupdate \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ general_matrix_matrix_triangular_product \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } \ } \ }; EIGEN_MKL_RANKUPDATE_SPECIALIZE(double) -//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex) EIGEN_MKL_RANKUPDATE_SPECIALIZE(float) -//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex) +// TODO handle complex cases +// EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex) +// EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex) // SYRK for float/double #define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \ @@ -80,7 +81,7 @@ struct general_matrix_matrix_rankupdate& /*blocking*/) \ { \ /* typedef Matrix MatrixRhs;*/ \ \ @@ -105,7 +106,7 @@ struct general_matrix_matrix_rankupdate& /*blocking*/) \ { \ typedef Matrix MatrixType; \ \ @@ -132,11 +133,12 @@ struct general_matrix_matrix_rankupdate > map_x(rhs,cols,1,InnerStride<>(incx)); \ @@ -114,14 +112,14 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ + MKLPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &numext::real_ref(beta), (MKLTYPE*)res, &incy); \ }\ }; -EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d) -EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s) -EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z) -EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, c) +EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d) +EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s) +EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, double, z) +EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, float, c) } // end namespase internal diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h index dfa687fef..9c2e811dd 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h @@ -52,24 +52,19 @@ struct product_selfadjoint_matrix& /*blocking*/) \ { \ char side='L', uplo='L'; \ MKL_INT m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ - EIGTYPE myone(1);\ \ /* Set transpose options */ \ /* Set m, n, k */ \ m = (MKL_INT)rows; \ n = (MKL_INT)cols; \ \ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ -\ /* Set lda, ldb, ldc */ \ lda = (MKL_INT)lhsStride; \ ldb = (MKL_INT)rhsStride; \ @@ -86,7 +81,7 @@ struct product_selfadjoint_matrix& /*blocking*/) \ { \ char side='L', uplo='L'; \ MKL_INT m, n, lda, ldb, ldc; \ const 
EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix a_tmp; \ - EIGTYPE myone(1); \ \ /* Set transpose options */ \ /* Set m, n, k */ \ m = (MKL_INT)rows; \ n = (MKL_INT)cols; \ \ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ -\ /* Set lda, ldb, ldc */ \ lda = (MKL_INT)lhsStride; \ ldb = (MKL_INT)rhsStride; \ @@ -154,15 +144,15 @@ struct product_selfadjoint_matrix& /*blocking*/) \ { \ char side='R', uplo='L'; \ MKL_INT m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ - EIGTYPE myone(1);\ \ /* Set m, n, k */ \ m = (MKL_INT)rows; \ n = (MKL_INT)cols; \ \ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ -\ /* Set lda, ldb, ldc */ \ lda = (MKL_INT)rhsStride; \ ldb = (MKL_INT)lhsStride; \ @@ -212,7 +197,7 @@ struct product_selfadjoint_matrix& /*blocking*/) \ { \ char side='R', uplo='L'; \ MKL_INT m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix a_tmp; \ - EIGTYPE myone(1); \ \ /* Set m, n, k */ \ m = (MKL_INT)rows; \ n = (MKL_INT)cols; \ \ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ -\ /* Set lda, ldb, ldc */ \ lda = (MKL_INT)rhsStride; \ ldb = (MKL_INT)lhsStride; \ @@ -279,14 +259,14 @@ struct product_selfadjoint_matrix map_x(_rhs,size,1); \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ + MKLFUNC(&uplo, &n, &numext::real_ref(alpha), (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &numext::real_ref(beta), (MKLTYPE*)res, &incy); \ }\ }; -EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv) -EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv) -EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) -EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv_) +EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv_) +EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) +EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, float, chemv_) } // end namespace internal diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h index d9e7cf852..31f6d2007 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h @@ -109,7 +109,8 @@ struct product_triangular_matrix_matrix_trmm(alpha_, alpha); \ \ /* Set m, n */ \ m = (MKL_INT)diagSize; \ @@ -175,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -184,9 +181,9 @@ struct product_triangular_matrix_matrix_trmm(alpha_, alpha); \ \ /* Set m, n */ \ m = (MKL_INT)rows; \ @@ -289,7 +282,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -298,9 +291,9 @@ struct product_triangular_matrix_matrix_trmm(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, EIGTYPE(1)); \ + EIGTYPE beta(1); \ \ /* Set m, n */ \ n = (MKL_INT)size; \ @@ -123,10 +121,10 @@ struct triangular_matrix_vector_product_trmv(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, EIGTYPE(1)); \ + EIGTYPE beta(1); \ \ /* Set m, n */ \ n = (MKL_INT)size; \ @@ -207,10 +203,10 @@ struct 
triangular_matrix_vector_product_trmv dcomplex; typedef std::complex scomplex; +#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) +typedef int MKL_INT; +#endif + namespace internal { template @@ -125,6 +129,7 @@ static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenTyp mklScalar=eigenScalar; } +#ifdef EIGEN_USE_MKL template <> inline void assign_scalar_eig2mkl(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) { mklScalar.real=eigenScalar.real(); @@ -148,11 +153,14 @@ inline void assign_conj_scalar_eig2mkl(MKL_Complex8& mklS mklScalar.real=eigenScalar.real(); mklScalar.imag=-eigenScalar.imag(); } +#endif } // end namespace internal } // end namespace Eigen +#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) +#include "../../misc/blas.h" #endif #endif // EIGEN_MKL_SUPPORT_H -- cgit v1.2.3 From 8191f373befc6d02e473d99ce0d86e92ee3a8736 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 15:37:16 +0200 Subject: Silent unused warning. --- Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h index 1cdf48fbf..91b949137 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h @@ -81,7 +81,7 @@ struct general_matrix_matrix_rankupdate& /*blocking*/) \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ /* typedef Matrix MatrixRhs;*/ \ \ @@ -106,7 +106,7 @@ struct general_matrix_matrix_rankupdate& /*blocking*/) \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ typedef Matrix MatrixType; \ \ -- cgit v1.2.3 From ddabc992faad25b8c1fca0d0c5ae35ea34e778a4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 15:52:01 +0200 Subject: Fix long to int conversion in BLAS API. 
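[Editorial note] The commit below replaces raw casts such as (MKL_INT)lhsStride with convert_index<BlasIndex>(lhsStride) throughout the wrappers. Eigen's Index is typically the 64-bit std::ptrdiff_t, while the Fortran BLAS integer is usually a 32-bit int, so an unchecked cast silently truncates once a dimension or stride exceeds 2^31 - 1. A checked narrowing helper in the spirit of Eigen's internal convert_index could look like the following sketch; the helper name and the bare assert are assumptions, not the library's exact code:

#include <cassert>
#include <cstddef>

typedef std::ptrdiff_t Index;  // Eigen's default index type
typedef int BlasIndex;         // Fortran INTEGER on common LP64 platforms

template<typename NewType, typename OldType>
inline NewType checked_convert_index(OldType value)
{
  // The narrowing is valid iff casting back recovers the original value.
  assert(value == static_cast<OldType>(static_cast<NewType>(value)) &&
         "index value does not fit in the BLAS integer type");
  return static_cast<NewType>(value);
}

// Usage mirroring the hunks below:
//   BlasIndex lda = checked_convert_index<BlasIndex>(lhsStride);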
--- .../products/GeneralMatrixMatrixTriangular_MKL.h | 20 +++--- Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h | 22 +++---- Eigen/src/Core/products/GeneralMatrixVector_MKL.h | 11 ++-- .../Core/products/SelfadjointMatrixMatrix_MKL.h | 72 +++++++++++----------- .../Core/products/SelfadjointMatrixVector_MKL.h | 6 +- .../src/Core/products/TriangularMatrixMatrix_MKL.h | 36 +++++------ .../src/Core/products/TriangularMatrixVector_MKL.h | 48 +++++++-------- .../src/Core/products/TriangularSolverMatrix_MKL.h | 24 ++++---- Eigen/src/Core/util/MKL_support.h | 4 +- 9 files changed, 123 insertions(+), 120 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h index 91b949137..6c835372c 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h @@ -72,7 +72,7 @@ EIGEN_MKL_RANKUPDATE_SPECIALIZE(float) // EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex) // SYRK for float/double -#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \ +#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, BLASTYPE, MKLFUNC) \ template \ struct general_matrix_matrix_rankupdate { \ enum { \ @@ -85,19 +85,19 @@ struct general_matrix_matrix_rankupdate MatrixRhs;*/ \ \ - MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \ + BlasIndex lda=convert_index(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \ - MKLTYPE alpha_, beta_; \ + BLASTYPE alpha_, beta_; \ \ /* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, EIGTYPE(1)); \ + assign_scalar_eig2mkl(alpha_, alpha); \ + assign_scalar_eig2mkl(beta_, EIGTYPE(1)); \ MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \ } \ }; // HERK for complex data -#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \ +#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, MKLFUNC) \ template \ struct general_matrix_matrix_rankupdate { \ enum { \ @@ -110,14 +110,14 @@ struct general_matrix_matrix_rankupdate MatrixType; \ \ - MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \ + BlasIndex lda=convert_index(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 
'C':'N'; \ RTYPE alpha_, beta_; \ const EIGTYPE* a_ptr; \ \ /* Set alpha_ & beta_ */ \ -/* assign_scalar_eig2mkl(alpha_, alpha); */\ -/* assign_scalar_eig2mkl(beta_, EIGTYPE(1));*/ \ +/* assign_scalar_eig2mkl(alpha_, alpha); */\ +/* assign_scalar_eig2mkl(beta_, EIGTYPE(1));*/ \ alpha_ = alpha.real(); \ beta_ = 1.0; \ /* Copy with conjugation in some cases*/ \ @@ -128,7 +128,7 @@ struct general_matrix_matrix_rankupdate(rows); \ + n = convert_index(cols); \ + k = convert_index(depth); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index(lhsStride); \ + ldb = convert_index(rhsStride); \ + ldc = convert_index(resStride); \ \ /* Set a, b, c */ \ if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \ Map > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \ a_tmp = lhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index(a_tmp.outerStride()); \ } else a = _lhs; \ \ if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \ Map > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \ b_tmp = rhs.conjugate(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index(b_tmp.outerStride()); \ } else b = _rhs; \ \ - MKLPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &numext::real_ref(beta), (MKLTYPE*)res, &ldc); \ + MKLPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; GEMM_SPECIALIZATION(double, d, double, d) diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h index fa5c9b6a0..c447c4aed 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h @@ -85,7 +85,7 @@ EIGEN_MKL_GEMV_SPECIALIZE(float) EIGEN_MKL_GEMV_SPECIALIZE(dcomplex) EIGEN_MKL_GEMV_SPECIALIZE(scomplex) -#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \ +#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,MKLPREFIX) \ template \ struct general_matrix_vector_product_gemv \ { \ @@ -97,13 +97,14 @@ static void run( \ const EIGTYPE* rhs, Index rhsIncr, \ EIGTYPE* res, Index resIncr, EIGTYPE alpha) \ { \ - MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \ + BlasIndex m=convert_index(rows), n=convert_index(cols), \ + lda=convert_index(lhsStride), incx=convert_index(rhsIncr), incy=convert_index(resIncr); \ const EIGTYPE beta(1); \ const EIGTYPE *x_ptr; \ char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 
'C' : 'T'; \ if (LhsStorageOrder==RowMajor) { \ - m = cols; \ - n = rows; \ + m = convert_index(cols); \ + n = convert_index(rows); \ }\ GEMVVector x_tmp; \ if (ConjugateRhs) { \ @@ -112,7 +113,7 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - MKLPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &numext::real_ref(beta), (MKLTYPE*)res, &incy); \ + MKLPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h index 9c2e811dd..b1176962b 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h @@ -40,7 +40,7 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_MKL_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ template \ @@ -55,20 +55,20 @@ struct product_selfadjoint_matrix& /*blocking*/) \ { \ char side='L', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ \ /* Set transpose options */ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ + m = convert_index(rows); \ + n = convert_index(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index(lhsStride); \ + ldb = convert_index(rhsStride); \ + ldc = convert_index(resStride); \ \ /* Set a, b, c */ \ if (LhsStorageOrder==RowMajor) uplo='U'; \ @@ -78,16 +78,16 @@ struct product_selfadjoint_matrix > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ b_tmp = rhs.adjoint(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index(b_tmp.outerStride()); \ } else b = _rhs; \ \ - MKLPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &numext::real_ref(beta), (MKLTYPE*)res, &ldc); \ + MKLPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_MKL_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ template \ @@ -101,7 +101,7 @@ struct product_selfadjoint_matrix& /*blocking*/) \ { \ char side='L', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ @@ -109,13 +109,13 @@ struct product_selfadjoint_matrix(rows); \ + n = convert_index(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index(lhsStride); \ + ldb = convert_index(rhsStride); \ + ldc = convert_index(resStride); \ \ /* Set a, b, c */ \ if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \ @@ -141,10 +141,10 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - MKLPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &numext::real_ref(beta), (MKLTYPE*)res, &ldc); \ + MKLPREFIX##hemm_(&side, &uplo, &m, &n, 
&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; @@ -157,7 +157,7 @@ EIGEN_MKL_HEMM_L(scomplex, float, cf, c) /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_MKL_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ template \ @@ -172,19 +172,19 @@ struct product_selfadjoint_matrix& /*blocking*/) \ { \ char side='R', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ + m = convert_index(rows); \ + n = convert_index(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)rhsStride; \ - ldb = (MKL_INT)lhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index(rhsStride); \ + ldb = convert_index(lhsStride); \ + ldc = convert_index(resStride); \ \ /* Set a, b, c */ \ if (RhsStorageOrder==RowMajor) uplo='U'; \ @@ -194,16 +194,16 @@ struct product_selfadjoint_matrix > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \ b_tmp = lhs.adjoint(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index(b_tmp.outerStride()); \ } else b = _lhs; \ \ - MKLPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &numext::real_ref(beta), (MKLTYPE*)res, &ldc); \ + MKLPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_MKL_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ template \ @@ -217,27 +217,27 @@ struct product_selfadjoint_matrix& /*blocking*/) \ { \ char side='R', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix a_tmp; \ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ + m = convert_index(rows); \ + n = convert_index(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)rhsStride; \ - ldb = (MKL_INT)lhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index(rhsStride); \ + ldb = convert_index(lhsStride); \ + ldc = convert_index(resStride); \ \ /* Set a, b, c */ \ if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \ Map, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \ a_tmp = rhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index(a_tmp.outerStride()); \ } else a = _rhs; \ if (RhsStorageOrder==RowMajor) uplo='U'; \ \ @@ -259,7 +259,7 @@ struct product_selfadjoint_matrix \ struct selfadjoint_matrix_vector_product_symv \ { \ @@ -85,7 +85,7 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ IsLower = UpLo == Lower ? 1 : 0 \ }; \ - MKL_INT n=size, lda=lhsStride, incx=1, incy=1; \ + BlasIndex n=convert_index(size), lda=convert_index(lhsStride), incx=1, incy=1; \ EIGTYPE beta(1); \ const EIGTYPE *x_ptr; \ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 
'L' : 'U'); \ @@ -95,7 +95,7 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - MKLFUNC(&uplo, &n, &numext::real_ref(alpha), (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &numext::real_ref(beta), (MKLTYPE*)res, &incy); \ + MKLFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h index 31f6d2007..47a8698a7 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h @@ -75,7 +75,7 @@ EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true) EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_MKL_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ template \ @@ -122,7 +122,7 @@ struct product_triangular_matrix_matrix_trmm > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \ MatrixLhs aa_tmp=lhsMap.template triangularView(); \ - MKL_INT aStride = aa_tmp.outerStride(); \ + BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product::run( \ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ @@ -134,11 +134,11 @@ struct product_triangular_matrix_matrix_trmm(diagSize); \ + n = convert_index(cols); \ \ /* Set trans */ \ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ @@ -149,7 +149,7 @@ struct product_triangular_matrix_matrix_trmm(b_tmp.outerStride()); \ \ /* Set uplo */ \ uplo = IsLower ? 'L' : 'U'; \ @@ -165,14 +165,14 @@ struct product_triangular_matrix_matrix_trmm(a_tmp.outerStride()); \ } else { \ a = _lhs; \ - lda = lhsStride; \ + lda = convert_index(lhsStride); \ } \ /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \ /* call ?trmm*/ \ - MKLPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \ + MKLPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -186,7 +186,7 @@ EIGEN_MKL_TRMM_L(float, float, f, s) EIGEN_MKL_TRMM_L(scomplex, float, cf, c) // implements col-major += alpha * op(general) * op(triangular) -#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_MKL_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ template \ @@ -232,7 +232,7 @@ struct product_triangular_matrix_matrix_trmm > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \ MatrixRhs aa_tmp=rhsMap.template triangularView(); \ - MKL_INT aStride = aa_tmp.outerStride(); \ + BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product::run( \ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ @@ -244,11 +244,11 @@ struct product_triangular_matrix_matrix_trmm(rows); \ + n = convert_index(diagSize); \ \ /* Set trans */ \ transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 
'C' : 'T') : 'N'; \ @@ -259,7 +259,7 @@ struct product_triangular_matrix_matrix_trmm(b_tmp.outerStride()); \ \ /* Set uplo */ \ uplo = IsLower ? 'L' : 'U'; \ @@ -275,14 +275,14 @@ struct product_triangular_matrix_matrix_trmm(a_tmp.outerStride()); \ } else { \ a = _rhs; \ - lda = rhsStride; \ + lda = convert_index(rhsStride); \ } \ /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \ /* call ?trmm*/ \ - MKLPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \ + MKLPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h index 3aaea3457..17c9eeb44 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h @@ -71,7 +71,7 @@ EIGEN_MKL_TRMV_SPECIALIZE(dcomplex) EIGEN_MKL_TRMV_SPECIALIZE(scomplex) // implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_MKL_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -105,15 +105,15 @@ struct triangular_matrix_vector_product_trmv(size); \ + lda = convert_index(lhsStride); \ incx = 1; \ - incy = resIncr; \ + incy = convert_index(resIncr); \ \ /* Set uplo, trans and diag*/ \ trans = 'N'; \ @@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ + n = convert_index(size); \ } \ else { \ x += size; \ y = _res; \ a = _lhs + size*lda; \ - m = size; \ - n = cols-size; \ + m = convert_index(size); \ + n = convert_index(cols-size); \ } \ - MKLPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &numext::real_ref(beta), (MKLTYPE*)y, &incy); \ + MKLPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; @@ -153,7 +153,7 @@ EIGEN_MKL_TRMV_CM(float, float, f, s) EIGEN_MKL_TRMV_CM(scomplex, float, cf, c) // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_MKL_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -187,15 +187,15 @@ struct triangular_matrix_vector_product_trmv(size); \ + lda = convert_index(lhsStride); \ incx = 1; \ - incy = resIncr; \ + incy = convert_index(resIncr); \ \ /* Set uplo, trans and diag*/ \ trans = ConjLhs ? 
'C' : 'T'; \ @@ -203,10 +203,10 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ + n = convert_index(size); \ } \ else { \ x += size; \ y = _res; \ a = _lhs + size; \ - m = size; \ - n = cols-size; \ + m = convert_index(size); \ + n = convert_index(cols-size); \ } \ - MKLPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &numext::real_ref(beta), (MKLTYPE*)y, &incy); \ + MKLPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h index 3677364e3..1f68a1cec 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h @@ -38,7 +38,7 @@ namespace Eigen { namespace internal { // implements LeftSide op(triangular)^-1 * general -#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \ +#define EIGEN_MKL_TRSM_L(EIGTYPE, BLASTYPE, MKLPREFIX) \ template \ struct triangular_solve_matrix \ { \ @@ -53,11 +53,11 @@ struct triangular_solve_matrix& /*blocking*/) \ { \ - MKL_INT m = size, n = otherSize, lda, ldb; \ + BlasIndex m = convert_index(size), n = convert_index(otherSize), lda, ldb; \ char side = 'L', uplo, diag='N', transa; \ /* Set alpha_ */ \ EIGTYPE alpha(1); \ - ldb = otherStride;\ + ldb = convert_index(otherStride);\ \ const EIGTYPE *a; \ /* Set trans */ \ @@ -73,14 +73,14 @@ struct triangular_solve_matrix(a_tmp.outerStride()); \ } else { \ a = _tri; \ - lda = triStride; \ + lda = convert_index(triStride); \ } \ if (IsUnitDiag) diag='U'; \ /* call ?trsm*/ \ - MKLPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \ + MKLPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ } \ }; @@ -91,7 +91,7 @@ EIGEN_MKL_TRSM_L(scomplex, float, c) // implements RightSide general * op(triangular)^-1 -#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \ +#define EIGEN_MKL_TRSM_R(EIGTYPE, BLASTYPE, MKLPREFIX) \ template \ struct triangular_solve_matrix \ { \ @@ -106,11 +106,11 @@ struct triangular_solve_matrix& /*blocking*/) \ { \ - MKL_INT m = otherSize, n = size, lda, ldb; \ + BlasIndex m = convert_index(otherSize), n = convert_index(size), lda, ldb; \ char side = 'R', uplo, diag='N', transa; \ /* Set alpha_ */ \ EIGTYPE alpha(1); \ - ldb = otherStride;\ + ldb = convert_index(otherStride);\ \ const EIGTYPE *a; \ /* Set trans */ \ @@ -126,14 +126,14 @@ struct triangular_solve_matrix(a_tmp.outerStride()); \ } else { \ a = _tri; \ - lda = triStride; \ + lda = convert_index(triStride); \ } \ if (IsUnitDiag) diag='U'; \ /* call ?trsm*/ \ - MKLPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \ + MKLPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ /*std::cout << "TRMS_L specialization!\n";*/ \ } \ }; diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index de7847fc4..382014e66 100644 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -114,7 +114,9 @@ typedef std::complex dcomplex; typedef std::complex scomplex; #if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) -typedef int 
MKL_INT; +typedef int BlasIndex; +#else +typedef MKL_INT BlasIndex; #endif namespace internal { -- cgit v1.2.3 From fec4c334bac76bfabd14168bf0ac668402f551a7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 16:04:09 +0200 Subject: Remove all references to MKL in BLAS wrappers. --- Eigen/Core | 16 +- .../products/GeneralMatrixMatrixTriangular_BLAS.h | 148 ++++++++++ .../products/GeneralMatrixMatrixTriangular_MKL.h | 148 ---------- Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h | 115 ++++++++ Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h | 115 -------- Eigen/src/Core/products/GeneralMatrixVector_BLAS.h | 129 +++++++++ Eigen/src/Core/products/GeneralMatrixVector_MKL.h | 129 --------- .../Core/products/SelfadjointMatrixMatrix_BLAS.h | 275 +++++++++++++++++++ .../Core/products/SelfadjointMatrixMatrix_MKL.h | 275 ------------------- .../Core/products/SelfadjointMatrixVector_BLAS.h | 111 ++++++++ .../Core/products/SelfadjointMatrixVector_MKL.h | 111 -------- .../Core/products/TriangularMatrixMatrix_BLAS.h | 302 +++++++++++++++++++++ .../src/Core/products/TriangularMatrixMatrix_MKL.h | 302 --------------------- .../Core/products/TriangularMatrixVector_BLAS.h | 241 ++++++++++++++++ .../src/Core/products/TriangularMatrixVector_MKL.h | 241 ---------------- .../Core/products/TriangularSolverMatrix_BLAS.h | 151 +++++++++++ .../src/Core/products/TriangularSolverMatrix_MKL.h | 151 ----------- 17 files changed, 1480 insertions(+), 1480 deletions(-) create mode 100644 Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h delete mode 100644 Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h create mode 100644 Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h delete mode 100644 Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h create mode 100644 Eigen/src/Core/products/GeneralMatrixVector_BLAS.h delete mode 100644 Eigen/src/Core/products/GeneralMatrixVector_MKL.h create mode 100644 Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h delete mode 100644 Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h create mode 100644 Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h delete mode 100644 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h create mode 100644 Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h delete mode 100644 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h create mode 100644 Eigen/src/Core/products/TriangularMatrixVector_BLAS.h delete mode 100644 Eigen/src/Core/products/TriangularMatrixVector_MKL.h create mode 100644 Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h delete mode 100644 Eigen/src/Core/products/TriangularSolverMatrix_MKL.h diff --git a/Eigen/Core b/Eigen/Core index 1e62f3ec1..30a572479 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -450,14 +450,14 @@ using std::ptrdiff_t; #include "src/Core/ArrayWrapper.h" #ifdef EIGEN_USE_BLAS -#include "src/Core/products/GeneralMatrixMatrix_MKL.h" -#include "src/Core/products/GeneralMatrixVector_MKL.h" -#include "src/Core/products/GeneralMatrixMatrixTriangular_MKL.h" -#include "src/Core/products/SelfadjointMatrixMatrix_MKL.h" -#include "src/Core/products/SelfadjointMatrixVector_MKL.h" -#include "src/Core/products/TriangularMatrixMatrix_MKL.h" -#include "src/Core/products/TriangularMatrixVector_MKL.h" -#include "src/Core/products/TriangularSolverMatrix_MKL.h" +#include "src/Core/products/GeneralMatrixMatrix_BLAS.h" +#include "src/Core/products/GeneralMatrixVector_BLAS.h" +#include "src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h" +#include 
"src/Core/products/SelfadjointMatrixMatrix_BLAS.h" +#include "src/Core/products/SelfadjointMatrixVector_BLAS.h" +#include "src/Core/products/TriangularMatrixMatrix_BLAS.h" +#include "src/Core/products/TriangularMatrixVector_BLAS.h" +#include "src/Core/products/TriangularSolverMatrix_BLAS.h" #endif // EIGEN_USE_BLAS #ifdef EIGEN_USE_MKL_VML diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h new file mode 100644 index 000000000..943d25bd1 --- /dev/null +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -0,0 +1,148 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Level 3 BLAS SYRK/HERK implementation. 
+ ******************************************************************************** +*/ + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H + +namespace Eigen { + +namespace internal { + +template +struct general_matrix_matrix_rankupdate : + general_matrix_matrix_triangular_product< + Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {}; + + +// try to go to BLAS specialization +#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \ +template \ +struct general_matrix_matrix_triangular_product { \ + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) \ + { \ + if (lhs==rhs) { \ + general_matrix_matrix_rankupdate \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ + } else { \ + general_matrix_matrix_triangular_product \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ + } \ + } \ +}; + +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double) +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float) +// TODO handle complex cases +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex) +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex) + +// SYRK for float/double +#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \ +template \ +struct general_matrix_matrix_rankupdate { \ + enum { \ + IsLower = (UpLo&Lower) == Lower, \ + LowUp = IsLower ? Lower : Upper, \ + conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \ + }; \ + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking& /*blocking*/) \ + { \ + /* typedef Matrix MatrixRhs;*/ \ +\ + BlasIndex lda=convert_index(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ + char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \ + BLASTYPE alpha_, beta_; \ +\ +/* Set alpha_ & beta_ */ \ + assign_scalar_eig2mkl(alpha_, alpha); \ + assign_scalar_eig2mkl(beta_, EIGTYPE(1)); \ + BLASFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \ + } \ +}; + +// HERK for complex data +#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \ +template \ +struct general_matrix_matrix_rankupdate { \ + enum { \ + IsLower = (UpLo&Lower) == Lower, \ + LowUp = IsLower ? Lower : Upper, \ + conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \ + }; \ + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking& /*blocking*/) \ + { \ + typedef Matrix MatrixType; \ +\ + BlasIndex lda=convert_index(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ + char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 
'C':'N'; \ + RTYPE alpha_, beta_; \ + const EIGTYPE* a_ptr; \ +\ +/* Set alpha_ & beta_ */ \ +/* assign_scalar_eig2mkl(alpha_, alpha); */\ +/* assign_scalar_eig2mkl(beta_, EIGTYPE(1));*/ \ + alpha_ = alpha.real(); \ + beta_ = 1.0; \ +/* Copy with conjugation in some cases*/ \ + MatrixType a; \ + if (conjA) { \ + Map > mapA(lhs,n,k,OuterStride<>(lhsStride)); \ + a = mapA.conjugate(); \ + lda = a.outerStride(); \ + a_ptr = a.data(); \ + } else a_ptr=lhs; \ + BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \ + } \ +}; + + +EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_) +EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_) + +// TODO hanlde complex cases +// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_) +// EIGEN_BLAS_RANKUPDATE_C(scomplex, float, float, cherk_) + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h deleted file mode 100644 index 6c835372c..000000000 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation. All rights reserved. - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL - * Level 3 BLAS SYRK/HERK implementation. 
- ******************************************************************************** -*/ - -#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H -#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H - -namespace Eigen { - -namespace internal { - -template -struct general_matrix_matrix_rankupdate : - general_matrix_matrix_triangular_product< - Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {}; - - -// try to go to BLAS specialization -#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \ -template \ -struct general_matrix_matrix_triangular_product { \ - static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) \ - { \ - if (lhs==rhs) { \ - general_matrix_matrix_rankupdate \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ - } else { \ - general_matrix_matrix_triangular_product \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ - } \ - } \ -}; - -EIGEN_MKL_RANKUPDATE_SPECIALIZE(double) -EIGEN_MKL_RANKUPDATE_SPECIALIZE(float) -// TODO handle complex cases -// EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex) -// EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex) - -// SYRK for float/double -#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, BLASTYPE, MKLFUNC) \ -template \ -struct general_matrix_matrix_rankupdate { \ - enum { \ - IsLower = (UpLo&Lower) == Lower, \ - LowUp = IsLower ? Lower : Upper, \ - conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \ - }; \ - static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking& /*blocking*/) \ - { \ - /* typedef Matrix MatrixRhs;*/ \ -\ - BlasIndex lda=convert_index(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ - char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \ - BLASTYPE alpha_, beta_; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, EIGTYPE(1)); \ - MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \ - } \ -}; - -// HERK for complex data -#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, MKLFUNC) \ -template \ -struct general_matrix_matrix_rankupdate { \ - enum { \ - IsLower = (UpLo&Lower) == Lower, \ - LowUp = IsLower ? Lower : Upper, \ - conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \ - }; \ - static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking& /*blocking*/) \ - { \ - typedef Matrix MatrixType; \ -\ - BlasIndex lda=convert_index(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ - char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 
'C':'N'; \ - RTYPE alpha_, beta_; \ - const EIGTYPE* a_ptr; \ -\ -/* Set alpha_ & beta_ */ \ -/* assign_scalar_eig2mkl(alpha_, alpha); */\ -/* assign_scalar_eig2mkl(beta_, EIGTYPE(1));*/ \ - alpha_ = alpha.real(); \ - beta_ = 1.0; \ -/* Copy with conjugation in some cases*/ \ - MatrixType a; \ - if (conjA) { \ - Map > mapA(lhs,n,k,OuterStride<>(lhsStride)); \ - a = mapA.conjugate(); \ - lda = a.outerStride(); \ - a_ptr = a.data(); \ - } else a_ptr=lhs; \ - MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \ - } \ -}; - - -EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk_) -EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk_) - -// TODO hanlde complex cases -// EIGEN_MKL_RANKUPDATE_C(dcomplex, double, double, zherk_) -// EIGEN_MKL_RANKUPDATE_C(scomplex, float, float, cherk_) - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h new file mode 100644 index 000000000..7a3bdbf20 --- /dev/null +++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h @@ -0,0 +1,115 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * General matrix-matrix product functionality based on ?GEMM. + ******************************************************************************** +*/ + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H + +namespace Eigen { + +namespace internal { + +/********************************************************************** +* This file implements general matrix-matrix multiplication using BLAS +* gemm function via partial specialization of +* general_matrix_matrix_product::run(..) 
method for float, double, +* std::complex and std::complex types +**********************************************************************/ + +// gemm specialization + +#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \ +template< \ + typename Index, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct general_matrix_matrix_product \ +{ \ +typedef gebp_traits Traits; \ +\ +static void run(Index rows, Index cols, Index depth, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, \ + level3_blocking& /*blocking*/, \ + GemmParallelInfo* /*info = 0*/) \ +{ \ + using std::conj; \ +\ + char transa, transb; \ + BlasIndex m, n, k, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX a_tmp, b_tmp; \ +\ +/* Set transpose options */ \ + transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ + transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \ +\ +/* Set m, n, k */ \ + m = convert_index(rows); \ + n = convert_index(cols); \ + k = convert_index(depth); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index(lhsStride); \ + ldb = convert_index(rhsStride); \ + ldc = convert_index(resStride); \ +\ +/* Set a, b, c */ \ + if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \ + Map > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \ + a_tmp = lhs.conjugate(); \ + a = a_tmp.data(); \ + lda = convert_index(a_tmp.outerStride()); \ + } else a = _lhs; \ +\ + if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \ + Map > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \ + b_tmp = rhs.conjugate(); \ + b = b_tmp.data(); \ + ldb = convert_index(b_tmp.outerStride()); \ + } else b = _rhs; \ +\ + BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ +}}; + +GEMM_SPECIALIZATION(double, d, double, d) +GEMM_SPECIALIZATION(float, f, float, s) +GEMM_SPECIALIZATION(dcomplex, cd, double, z) +GEMM_SPECIALIZATION(scomplex, cf, float, c) + +} // end namespase internal + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h deleted file mode 100644 index 299faf2f2..000000000 --- a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation. All rights reserved. - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL - * General matrix-matrix product functionality based on ?GEMM. - ******************************************************************************** -*/ - -#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H -#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H - -namespace Eigen { - -namespace internal { - -/********************************************************************** -* This file implements general matrix-matrix multiplication using BLAS -* gemm function via partial specialization of -* general_matrix_matrix_product::run(..) method for float, double, -* std::complex and std::complex types -**********************************************************************/ - -// gemm specialization - -#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, MKLPREFIX) \ -template< \ - typename Index, \ - int LhsStorageOrder, bool ConjugateLhs, \ - int RhsStorageOrder, bool ConjugateRhs> \ -struct general_matrix_matrix_product \ -{ \ -typedef gebp_traits Traits; \ -\ -static void run(Index rows, Index cols, Index depth, \ - const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ - EIGTYPE alpha, \ - level3_blocking& /*blocking*/, \ - GemmParallelInfo* /*info = 0*/) \ -{ \ - using std::conj; \ -\ - char transa, transb; \ - BlasIndex m, n, k, lda, ldb, ldc; \ - const EIGTYPE *a, *b; \ - EIGTYPE beta(1); \ - MatrixX##EIGPREFIX a_tmp, b_tmp; \ -\ -/* Set transpose options */ \ - transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ - transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 
'C' : 'T') : 'N'; \ -\ -/* Set m, n, k */ \ - m = convert_index(rows); \ - n = convert_index(cols); \ - k = convert_index(depth); \ -\ -/* Set lda, ldb, ldc */ \ - lda = convert_index(lhsStride); \ - ldb = convert_index(rhsStride); \ - ldc = convert_index(resStride); \ -\ -/* Set a, b, c */ \ - if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \ - Map > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \ - a_tmp = lhs.conjugate(); \ - a = a_tmp.data(); \ - lda = convert_index(a_tmp.outerStride()); \ - } else a = _lhs; \ -\ - if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \ - Map > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \ - b_tmp = rhs.conjugate(); \ - b = b_tmp.data(); \ - ldb = convert_index(b_tmp.outerStride()); \ - } else b = _rhs; \ -\ - MKLPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ -}}; - -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, double, z) -GEMM_SPECIALIZATION(scomplex, cf, float, c) - -} // end namespase internal - -} // end namespace Eigen - -#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h new file mode 100644 index 000000000..e3a5d5892 --- /dev/null +++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -0,0 +1,129 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * General matrix-vector product functionality based on ?GEMV. 
+ ******************************************************************************** +*/ + +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H +#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H + +namespace Eigen { + +namespace internal { + +/********************************************************************** +* This file implements general matrix-vector multiplication using BLAS +* gemv function via partial specialization of +* general_matrix_vector_product::run(..) method for float, double, +* std::complex and std::complex types +**********************************************************************/ + +// gemv specialization + +template +struct general_matrix_vector_product_gemv; + +#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \ +template \ +struct general_matrix_vector_product,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,Specialized> { \ +static void run( \ + Index rows, Index cols, \ + const const_blas_data_mapper &lhs, \ + const const_blas_data_mapper &rhs, \ + Scalar* res, Index resIncr, Scalar alpha) \ +{ \ + if (ConjugateLhs) { \ + general_matrix_vector_product,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,BuiltIn>::run( \ + rows, cols, lhs, rhs, res, resIncr, alpha); \ + } else { \ + general_matrix_vector_product_gemv::run( \ + rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ + } \ +} \ +}; \ +template \ +struct general_matrix_vector_product,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,Specialized> { \ +static void run( \ + Index rows, Index cols, \ + const const_blas_data_mapper &lhs, \ + const const_blas_data_mapper &rhs, \ + Scalar* res, Index resIncr, Scalar alpha) \ +{ \ + general_matrix_vector_product_gemv::run( \ + rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ +} \ +}; \ + +EIGEN_BLAS_GEMV_SPECIALIZE(double) +EIGEN_BLAS_GEMV_SPECIALIZE(float) +EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) + +#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \ +template \ +struct general_matrix_vector_product_gemv \ +{ \ +typedef Matrix GEMVVector;\ +\ +static void run( \ + Index rows, Index cols, \ + const EIGTYPE* lhs, Index lhsStride, \ + const EIGTYPE* rhs, Index rhsIncr, \ + EIGTYPE* res, Index resIncr, EIGTYPE alpha) \ +{ \ + BlasIndex m=convert_index(rows), n=convert_index(cols), \ + lda=convert_index(lhsStride), incx=convert_index(rhsIncr), incy=convert_index(resIncr); \ + const EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ + char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 
'C' : 'T'; \
+  if (LhsStorageOrder==RowMajor) { \
+    m = convert_index<BlasIndex>(cols); \
+    n = convert_index<BlasIndex>(rows); \
+  }\
+  GEMVVector x_tmp; \
+  if (ConjugateRhs) { \
+    Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
+    x_tmp=map_x.conjugate(); \
+    x_ptr=x_tmp.data(); \
+    incx=1; \
+  } else x_ptr=rhs; \
+  BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+}\
+};
+
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
deleted file mode 100644
index c447c4aed..000000000
--- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
- * General matrix-vector product functionality based on ?GEMV.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
-#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
-
-namespace Eigen {
-
-namespace internal {
-
-/**********************************************************************
-* This file implements general matrix-vector multiplication using BLAS
-* gemv function via partial specialization of
-* general_matrix_vector_product::run(..)
method for float, double, -* std::complex and std::complex types -**********************************************************************/ - -// gemv specialization - -template -struct general_matrix_vector_product_gemv; - -#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \ -template \ -struct general_matrix_vector_product,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,Specialized> { \ -static void run( \ - Index rows, Index cols, \ - const const_blas_data_mapper &lhs, \ - const const_blas_data_mapper &rhs, \ - Scalar* res, Index resIncr, Scalar alpha) \ -{ \ - if (ConjugateLhs) { \ - general_matrix_vector_product,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,BuiltIn>::run( \ - rows, cols, lhs, rhs, res, resIncr, alpha); \ - } else { \ - general_matrix_vector_product_gemv::run( \ - rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ - } \ -} \ -}; \ -template \ -struct general_matrix_vector_product,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,Specialized> { \ -static void run( \ - Index rows, Index cols, \ - const const_blas_data_mapper &lhs, \ - const const_blas_data_mapper &rhs, \ - Scalar* res, Index resIncr, Scalar alpha) \ -{ \ - general_matrix_vector_product_gemv::run( \ - rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ -} \ -}; \ - -EIGEN_MKL_GEMV_SPECIALIZE(double) -EIGEN_MKL_GEMV_SPECIALIZE(float) -EIGEN_MKL_GEMV_SPECIALIZE(dcomplex) -EIGEN_MKL_GEMV_SPECIALIZE(scomplex) - -#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,MKLPREFIX) \ -template \ -struct general_matrix_vector_product_gemv \ -{ \ -typedef Matrix GEMVVector;\ -\ -static void run( \ - Index rows, Index cols, \ - const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* rhs, Index rhsIncr, \ - EIGTYPE* res, Index resIncr, EIGTYPE alpha) \ -{ \ - BlasIndex m=convert_index(rows), n=convert_index(cols), \ - lda=convert_index(lhsStride), incx=convert_index(rhsIncr), incy=convert_index(resIncr); \ - const EIGTYPE beta(1); \ - const EIGTYPE *x_ptr; \ - char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \ - if (LhsStorageOrder==RowMajor) { \ - m = convert_index(cols); \ - n = convert_index(rows); \ - }\ - GEMVVector x_tmp; \ - if (ConjugateRhs) { \ - Map > map_x(rhs,cols,1,InnerStride<>(incx)); \ - x_tmp=map_x.conjugate(); \ - x_ptr=x_tmp.data(); \ - incx=1; \ - } else x_ptr=rhs; \ - MKLPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ -}\ -}; - -EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d) -EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s) -EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, double, z) -EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, float, c) - -} // end namespase internal - -} // end namespace Eigen - -#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h new file mode 100644 index 000000000..c3e37b1e0 --- /dev/null +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -0,0 +1,275 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
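/* --------------------------------------------------------------------------
 * Illustrative sketch for the GEMV path above: what the BLAS branch of
 * EIGEN_BLAS_GEMV_SPECIALIZE boils down to for a column-major,
 * non-conjugated lhs. ?GEMV only offers 'N'/'T'/'C' operations, i.e. no
 * "conjugate without transpose", which is why the macro sends a conjugated
 * column-major lhs back to the BuiltIn kernel instead. Hypothetical driver;
 * assumes an f77 BLAS (dgemv_) with the conventional binding.
 * ------------------------------------------------------------------------*/
#include <cstdio>

extern "C" void dgemv_(const char* trans, const int* m, const int* n,
                       const double* alpha, const double* a, const int* lda,
                       const double* x, const int* incx,
                       const double* beta, double* y, const int* incy);

int main() {
  double A[] = {1, 3, 2, 4};       // 2x2 column-major: [[1,2],[3,4]]
  double x[] = {1, 1};
  double y[] = {0, 0};
  int m = 2, n = 2, inc = 1;
  double alpha = 1.0, beta = 1.0;  // beta=1 accumulates into y, as in Eigen
  char trans = 'N';                // a row-major lhs would swap m/n and use 'T'
  dgemv_(&trans, &m, &n, &alpha, A, &m, x, &inc, &beta, y, &inc);
  std::printf("y = [%g %g]\n", y[0], y[1]); // A*x = [3 7]
  return 0;
}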
+ * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM. + ******************************************************************************** +*/ + +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H + +namespace Eigen { + +namespace internal { + + +/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ + +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +template \ +struct product_selfadjoint_matrix \ +{\ +\ + static void run( \ + Index rows, Index cols, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, level3_blocking& /*blocking*/) \ + { \ + char side='L', uplo='L'; \ + BlasIndex m, n, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX b_tmp; \ +\ +/* Set transpose options */ \ +/* Set m, n, k */ \ + m = convert_index(rows); \ + n = convert_index(cols); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index(lhsStride); \ + ldb = convert_index(rhsStride); \ + ldc = convert_index(resStride); \ +\ +/* Set a, b, c */ \ + if (LhsStorageOrder==RowMajor) uplo='U'; \ + a = _lhs; \ +\ + if (RhsStorageOrder==RowMajor) { \ + Map > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ + b_tmp = rhs.adjoint(); \ + b = b_tmp.data(); \ + ldb = convert_index(b_tmp.outerStride()); \ + } else b = _rhs; \ +\ + BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ +\ + } \ +}; + + +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +template \ +struct product_selfadjoint_matrix \ +{\ + static void run( \ + Index rows, Index cols, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, level3_blocking& /*blocking*/) \ + { \ + char side='L', uplo='L'; \ + BlasIndex m, n, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX b_tmp; \ + Matrix a_tmp; \ +\ +/* Set transpose options */ \ +/* Set m, n, k */ \ + m = convert_index(rows); \ + n = convert_index(cols); \ +\ +/* Set lda, ldb, 
ldc */ \ + lda = convert_index(lhsStride); \ + ldb = convert_index(rhsStride); \ + ldc = convert_index(resStride); \ +\ +/* Set a, b, c */ \ + if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \ + Map, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \ + a_tmp = lhs.conjugate(); \ + a = a_tmp.data(); \ + lda = a_tmp.outerStride(); \ + } else a = _lhs; \ + if (LhsStorageOrder==RowMajor) uplo='U'; \ +\ + if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \ + b = _rhs; } \ + else { \ + if (RhsStorageOrder==ColMajor && ConjugateRhs) { \ + Map > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \ + b_tmp = rhs.conjugate(); \ + } else \ + if (ConjugateRhs) { \ + Map > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ + b_tmp = rhs.adjoint(); \ + } else { \ + Map > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ + b_tmp = rhs.transpose(); \ + } \ + b = b_tmp.data(); \ + ldb = convert_index(b_tmp.outerStride()); \ + } \ +\ + BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ +\ + } \ +}; + +EIGEN_BLAS_SYMM_L(double, double, d, d) +EIGEN_BLAS_SYMM_L(float, float, f, s) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) + + +/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ + +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +template \ +struct product_selfadjoint_matrix \ +{\ +\ + static void run( \ + Index rows, Index cols, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, level3_blocking& /*blocking*/) \ + { \ + char side='R', uplo='L'; \ + BlasIndex m, n, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX b_tmp; \ +\ +/* Set m, n, k */ \ + m = convert_index(rows); \ + n = convert_index(cols); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index(rhsStride); \ + ldb = convert_index(lhsStride); \ + ldc = convert_index(resStride); \ +\ +/* Set a, b, c */ \ + if (RhsStorageOrder==RowMajor) uplo='U'; \ + a = _rhs; \ +\ + if (LhsStorageOrder==RowMajor) { \ + Map > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \ + b_tmp = lhs.adjoint(); \ + b = b_tmp.data(); \ + ldb = convert_index(b_tmp.outerStride()); \ + } else b = _lhs; \ +\ + BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ +\ + } \ +}; + + +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +template \ +struct product_selfadjoint_matrix \ +{\ + static void run( \ + Index rows, Index cols, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, level3_blocking& /*blocking*/) \ + { \ + char side='R', uplo='L'; \ + BlasIndex m, n, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX b_tmp; \ + Matrix a_tmp; \ +\ +/* Set m, n, k */ \ + m = convert_index(rows); \ + n = convert_index(cols); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index(rhsStride); \ + ldb = convert_index(lhsStride); \ + ldc = convert_index(resStride); \ +\ +/* Set a, b, c */ \ + if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \ + Map, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \ + a_tmp = rhs.conjugate(); \ + a = 
a_tmp.data(); \ + lda = convert_index(a_tmp.outerStride()); \ + } else a = _rhs; \ + if (RhsStorageOrder==RowMajor) uplo='U'; \ +\ + if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \ + b = _lhs; } \ + else { \ + if (LhsStorageOrder==ColMajor && ConjugateLhs) { \ + Map > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \ + b_tmp = lhs.conjugate(); \ + } else \ + if (ConjugateLhs) { \ + Map > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \ + b_tmp = lhs.adjoint(); \ + } else { \ + Map > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \ + b_tmp = lhs.transpose(); \ + } \ + b = b_tmp.data(); \ + ldb = b_tmp.outerStride(); \ + } \ +\ + BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + } \ +}; + +EIGEN_BLAS_SYMM_R(double, double, d, d) +EIGEN_BLAS_SYMM_R(float, float, f, s) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h deleted file mode 100644 index b1176962b..000000000 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +++ /dev/null @@ -1,275 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation. All rights reserved. - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// - ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL - * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM. 
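/* --------------------------------------------------------------------------
 * Illustrative sketch for the ?SYMM mapping used above: with side='L' the
 * routine computes C := alpha*A*B + beta*C where A is symmetric and uplo
 * names the stored triangle. A row-major selfadjoint lhs merely flips uplo
 * ('L' <-> 'U'), because the lower triangle in row-major order occupies the
 * positions of the upper triangle in column-major order. Hypothetical
 * driver; assumes an f77 BLAS (dsymm_) with the conventional binding.
 * ------------------------------------------------------------------------*/
#include <cstdio>

extern "C" void dsymm_(const char* side, const char* uplo,
                       const int* m, const int* n,
                       const double* alpha, const double* a, const int* lda,
                       const double* b, const int* ldb,
                       const double* beta, double* c, const int* ldc);

int main() {
  // Symmetric A = [[2,1],[1,2]], column-major; with uplo='L' only the
  // lower triangle (entries 2, 1 and the diagonal 2) is ever read.
  double A[] = {2, 1, /*never read*/0, 2};
  double B[] = {1, 0, 0, 1};        // identity, so C should come back as A
  double C[] = {0, 0, 0, 0};
  int m = 2, n = 2;
  double alpha = 1.0, beta = 1.0;
  char side = 'L', uplo = 'L';
  dsymm_(&side, &uplo, &m, &n, &alpha, A, &m, B, &m, &beta, C, &m);
  std::printf("C = [%g %g; %g %g]\n", C[0], C[2], C[1], C[3]); // [2 1; 1 2]
  return 0;
}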
- ******************************************************************************** -*/ - -#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H -#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H - -namespace Eigen { - -namespace internal { - - -/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ - -#define EIGEN_MKL_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ -template \ -struct product_selfadjoint_matrix \ -{\ -\ - static void run( \ - Index rows, Index cols, \ - const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ - EIGTYPE alpha, level3_blocking& /*blocking*/) \ - { \ - char side='L', uplo='L'; \ - BlasIndex m, n, lda, ldb, ldc; \ - const EIGTYPE *a, *b; \ - EIGTYPE beta(1); \ - MatrixX##EIGPREFIX b_tmp; \ -\ -/* Set transpose options */ \ -/* Set m, n, k */ \ - m = convert_index(rows); \ - n = convert_index(cols); \ -\ -/* Set lda, ldb, ldc */ \ - lda = convert_index(lhsStride); \ - ldb = convert_index(rhsStride); \ - ldc = convert_index(resStride); \ -\ -/* Set a, b, c */ \ - if (LhsStorageOrder==RowMajor) uplo='U'; \ - a = _lhs; \ -\ - if (RhsStorageOrder==RowMajor) { \ - Map > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ - b_tmp = rhs.adjoint(); \ - b = b_tmp.data(); \ - ldb = convert_index(b_tmp.outerStride()); \ - } else b = _rhs; \ -\ - MKLPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ -\ - } \ -}; - - -#define EIGEN_MKL_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ -template \ -struct product_selfadjoint_matrix \ -{\ - static void run( \ - Index rows, Index cols, \ - const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ - EIGTYPE alpha, level3_blocking& /*blocking*/) \ - { \ - char side='L', uplo='L'; \ - BlasIndex m, n, lda, ldb, ldc; \ - const EIGTYPE *a, *b; \ - EIGTYPE beta(1); \ - MatrixX##EIGPREFIX b_tmp; \ - Matrix a_tmp; \ -\ -/* Set transpose options */ \ -/* Set m, n, k */ \ - m = convert_index(rows); \ - n = convert_index(cols); \ -\ -/* Set lda, ldb, ldc */ \ - lda = convert_index(lhsStride); \ - ldb = convert_index(rhsStride); \ - ldc = convert_index(resStride); \ -\ -/* Set a, b, c */ \ - if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \ - Map, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \ - a_tmp = lhs.conjugate(); \ - a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ - } else a = _lhs; \ - if (LhsStorageOrder==RowMajor) uplo='U'; \ -\ - if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \ - b = _rhs; } \ - else { \ - if (RhsStorageOrder==ColMajor && ConjugateRhs) { \ - Map > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \ - b_tmp = rhs.conjugate(); \ - } else \ - if (ConjugateRhs) { \ - Map > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ - b_tmp = rhs.adjoint(); \ - } else { \ - Map > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ - b_tmp = rhs.transpose(); \ - } \ - b = b_tmp.data(); \ - ldb = convert_index(b_tmp.outerStride()); \ - } \ -\ - MKLPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ -\ - } \ -}; - -EIGEN_MKL_SYMM_L(double, double, d, d) -EIGEN_MKL_SYMM_L(float, float, f, s) -EIGEN_MKL_HEMM_L(dcomplex, double, cd, z) -EIGEN_MKL_HEMM_L(scomplex, float, cf, c) - - -/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ - 
-#define EIGEN_MKL_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ -template \ -struct product_selfadjoint_matrix \ -{\ -\ - static void run( \ - Index rows, Index cols, \ - const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ - EIGTYPE alpha, level3_blocking& /*blocking*/) \ - { \ - char side='R', uplo='L'; \ - BlasIndex m, n, lda, ldb, ldc; \ - const EIGTYPE *a, *b; \ - EIGTYPE beta(1); \ - MatrixX##EIGPREFIX b_tmp; \ -\ -/* Set m, n, k */ \ - m = convert_index(rows); \ - n = convert_index(cols); \ -\ -/* Set lda, ldb, ldc */ \ - lda = convert_index(rhsStride); \ - ldb = convert_index(lhsStride); \ - ldc = convert_index(resStride); \ -\ -/* Set a, b, c */ \ - if (RhsStorageOrder==RowMajor) uplo='U'; \ - a = _rhs; \ -\ - if (LhsStorageOrder==RowMajor) { \ - Map > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \ - b_tmp = lhs.adjoint(); \ - b = b_tmp.data(); \ - ldb = convert_index(b_tmp.outerStride()); \ - } else b = _lhs; \ -\ - MKLPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ -\ - } \ -}; - - -#define EIGEN_MKL_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ -template \ -struct product_selfadjoint_matrix \ -{\ - static void run( \ - Index rows, Index cols, \ - const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ - EIGTYPE alpha, level3_blocking& /*blocking*/) \ - { \ - char side='R', uplo='L'; \ - BlasIndex m, n, lda, ldb, ldc; \ - const EIGTYPE *a, *b; \ - EIGTYPE beta(1); \ - MatrixX##EIGPREFIX b_tmp; \ - Matrix a_tmp; \ -\ -/* Set m, n, k */ \ - m = convert_index(rows); \ - n = convert_index(cols); \ -\ -/* Set lda, ldb, ldc */ \ - lda = convert_index(rhsStride); \ - ldb = convert_index(lhsStride); \ - ldc = convert_index(resStride); \ -\ -/* Set a, b, c */ \ - if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \ - Map, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \ - a_tmp = rhs.conjugate(); \ - a = a_tmp.data(); \ - lda = convert_index(a_tmp.outerStride()); \ - } else a = _rhs; \ - if (RhsStorageOrder==RowMajor) uplo='U'; \ -\ - if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \ - b = _lhs; } \ - else { \ - if (LhsStorageOrder==ColMajor && ConjugateLhs) { \ - Map > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \ - b_tmp = lhs.conjugate(); \ - } else \ - if (ConjugateLhs) { \ - Map > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \ - b_tmp = lhs.adjoint(); \ - } else { \ - Map > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \ - b_tmp = lhs.transpose(); \ - } \ - b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ - } \ -\ - MKLPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ - } \ -}; - -EIGEN_MKL_SYMM_R(double, double, d, d) -EIGEN_MKL_SYMM_R(float, float, f, s) -EIGEN_MKL_HEMM_R(dcomplex, double, cd, z) -EIGEN_MKL_HEMM_R(scomplex, float, cf, c) - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h new file mode 100644 index 000000000..38f23accf --- /dev/null +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -0,0 +1,111 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV. + ******************************************************************************** +*/ + +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H + +namespace Eigen { + +namespace internal { + +/********************************************************************** +* This file implements selfadjoint matrix-vector multiplication using BLAS +**********************************************************************/ + +// symv/hemv specialization + +template +struct selfadjoint_matrix_vector_product_symv : + selfadjoint_matrix_vector_product {}; + +#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \ +template \ +struct selfadjoint_matrix_vector_product { \ +static void run( \ + Index size, const Scalar* lhs, Index lhsStride, \ + const Scalar* _rhs, Scalar* res, Scalar alpha) { \ + enum {\ + IsColMajor = StorageOrder==ColMajor \ + }; \ + if (IsColMajor == ConjugateLhs) {\ + selfadjoint_matrix_vector_product::run( \ + size, lhs, lhsStride, _rhs, res, alpha); \ + } else {\ + selfadjoint_matrix_vector_product_symv::run( \ + size, lhs, lhsStride, _rhs, res, alpha); \ + }\ + } \ +}; \ + +EIGEN_BLAS_SYMV_SPECIALIZE(double) +EIGEN_BLAS_SYMV_SPECIALIZE(float) +EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_SYMV_SPECIALIZE(scomplex) + +#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ +template \ +struct selfadjoint_matrix_vector_product_symv \ +{ \ +typedef Matrix SYMVVector;\ +\ +static void run( \ +Index size, const EIGTYPE* lhs, Index lhsStride, \ +const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ +{ \ + enum {\ + IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ + IsLower = UpLo == Lower ? 1 : 0 \ + }; \ + BlasIndex n=convert_index(size), lda=convert_index(lhsStride), incx=1, incy=1; \ + EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ + char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 
'L' : 'U'); \ + SYMVVector x_tmp; \ + if (ConjugateRhs) { \ + Map map_x(_rhs,size,1); \ + x_tmp=map_x.conjugate(); \ + x_ptr=x_tmp.data(); \ + } else x_ptr=_rhs; \ + BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ +}\ +}; + +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h deleted file mode 100644 index 2a8362202..000000000 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation. All rights reserved. - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL - * Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV. 
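/* --------------------------------------------------------------------------
 * Why the (IsColMajor == ConjugateLhs) test above is the right dispatch: a
 * selfadjoint matrix satisfies A == conj(A^T), so reading a row-major
 * Hermitian matrix as column-major yields conj(A). Hence exactly the
 * "col-major + no conjugation" and "row-major + conjugation" cases reduce
 * to an unconjugated column-major A that ?SYMV/?HEMV can consume (with uplo
 * flipped for the latter); the other two would need a conjugation BLAS does
 * not offer and stay on the BuiltIn kernel. A tiny self-contained check of
 * the identity (no BLAS required):
 * ------------------------------------------------------------------------*/
#include <complex>
#include <cstdio>

int main() {
  using Cplx = std::complex<double>;
  // Hermitian 2x2: real diagonal, off-diagonal entries conjugate to each other.
  Cplx A[2][2] = {{Cplx(2, 0), Cplx(1, -1)},
                  {Cplx(1, 1), Cplx(3, 0)}};
  bool ok = true;
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      ok = ok && (A[i][j] == std::conj(A[j][i]));  // A == conj(A^T)
  std::printf("A == conj(A^T): %s\n", ok ? "yes" : "no");
  return 0;
}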
- ******************************************************************************** -*/ - -#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H -#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H - -namespace Eigen { - -namespace internal { - -/********************************************************************** -* This file implements selfadjoint matrix-vector multiplication using BLAS -**********************************************************************/ - -// symv/hemv specialization - -template -struct selfadjoint_matrix_vector_product_symv : - selfadjoint_matrix_vector_product {}; - -#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \ -template \ -struct selfadjoint_matrix_vector_product { \ -static void run( \ - Index size, const Scalar* lhs, Index lhsStride, \ - const Scalar* _rhs, Scalar* res, Scalar alpha) { \ - enum {\ - IsColMajor = StorageOrder==ColMajor \ - }; \ - if (IsColMajor == ConjugateLhs) {\ - selfadjoint_matrix_vector_product::run( \ - size, lhs, lhsStride, _rhs, res, alpha); \ - } else {\ - selfadjoint_matrix_vector_product_symv::run( \ - size, lhs, lhsStride, _rhs, res, alpha); \ - }\ - } \ -}; \ - -EIGEN_MKL_SYMV_SPECIALIZE(double) -EIGEN_MKL_SYMV_SPECIALIZE(float) -EIGEN_MKL_SYMV_SPECIALIZE(dcomplex) -EIGEN_MKL_SYMV_SPECIALIZE(scomplex) - -#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,MKLFUNC) \ -template \ -struct selfadjoint_matrix_vector_product_symv \ -{ \ -typedef Matrix SYMVVector;\ -\ -static void run( \ -Index size, const EIGTYPE* lhs, Index lhsStride, \ -const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ -{ \ - enum {\ - IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ - IsLower = UpLo == Lower ? 1 : 0 \ - }; \ - BlasIndex n=convert_index(size), lda=convert_index(lhsStride), incx=1, incy=1; \ - EIGTYPE beta(1); \ - const EIGTYPE *x_ptr; \ - char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \ - SYMVVector x_tmp; \ - if (ConjugateRhs) { \ - Map map_x(_rhs,size,1); \ - x_tmp=map_x.conjugate(); \ - x_ptr=x_tmp.data(); \ - } else x_ptr=_rhs; \ - MKLFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ -}\ -}; - -EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv_) -EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv_) -EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) -EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, float, chemv_) - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h new file mode 100644 index 000000000..aecded6bb --- /dev/null +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -0,0 +1,302 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. 
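/* --------------------------------------------------------------------------
 * Illustrative sketch of the single call the SYMV specialization above
 * reduces to: only one triangle of the symmetric matrix is read, selected
 * by the uplo character the macro computes (and flips for row-major
 * storage). Hypothetical driver; assumes an f77 BLAS (dsymv_) with the
 * conventional binding, beta=1 accumulating as in Eigen.
 * ------------------------------------------------------------------------*/
#include <cstdio>

extern "C" void dsymv_(const char* uplo, const int* n,
                       const double* alpha, const double* a, const int* lda,
                       const double* x, const int* incx,
                       const double* beta, double* y, const int* incy);

int main() {
  double A[] = {2, 1, /*never read*/0, 3};  // lower triangle of [[2,1],[1,3]]
  double x[] = {1, 2};
  double y[] = {0, 0};
  int n = 2, inc = 1;
  double alpha = 1.0, beta = 1.0;
  char uplo = 'L';
  dsymv_(&uplo, &n, &alpha, A, &n, x, &inc, &beta, y, &inc);
  std::printf("y = [%g %g]\n", y[0], y[1]);  // A*x = [4 7]
  return 0;
}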
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Triangular matrix * matrix product functionality based on ?TRMM. + ******************************************************************************** +*/ + +#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H + +namespace Eigen { + +namespace internal { + + +template +struct product_triangular_matrix_matrix_trmm : + product_triangular_matrix_matrix {}; + + +// try to go to BLAS specialization +#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ +template \ +struct product_triangular_matrix_matrix { \ + static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\ + const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) { \ + product_triangular_matrix_matrix_trmm::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + } \ +}; + +EIGEN_BLAS_TRMM_SPECIALIZE(double, true) +EIGEN_BLAS_TRMM_SPECIALIZE(double, false) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false) +EIGEN_BLAS_TRMM_SPECIALIZE(float, true) +EIGEN_BLAS_TRMM_SPECIALIZE(float, false) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) + +// implements col-major += alpha * op(triangular) * op(general) +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +template \ +struct product_triangular_matrix_matrix_trmm \ +{ \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + LowUp = IsLower ? Lower : Upper, \ + conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \ + }; \ +\ + static void run( \ + Index _rows, Index _cols, Index _depth, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, level3_blocking& blocking) \ + { \ + Index diagSize = (std::min)(_rows,_depth); \ + Index rows = IsLower ? _rows : diagSize; \ + Index depth = IsLower ? diagSize : _depth; \ + Index cols = _cols; \ +\ + typedef Matrix MatrixLhs; \ + typedef Matrix MatrixRhs; \ +\ +/* Non-square case - doesn't fit to BLAS ?TRMM. 
Fall to default triangular product or call BLAS ?GEMM*/ \ + if (rows != depth) { \ +\ + /* FIXME handle mkl_domain_get_max_threads */ \ + /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\ +\ + if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \ + /* Most likely no benefit to call TRMM or GEMM from BLAS */ \ + product_triangular_matrix_matrix::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \ + } else { \ + /* Make sense to call GEMM */ \ + Map > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \ + MatrixLhs aa_tmp=lhsMap.template triangularView(); \ + BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ + gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ + general_matrix_matrix_product::run( \ + rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ +\ + /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ + } \ + return; \ + } \ + char side = 'L', transa, uplo, diag = 'N'; \ + EIGTYPE *b; \ + const EIGTYPE *a; \ + BlasIndex m, n, lda, ldb; \ +\ +/* Set m, n */ \ + m = convert_index(diagSize); \ + n = convert_index(cols); \ +\ +/* Set trans */ \ + transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ +\ +/* Set b, ldb */ \ + Map > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \ + MatrixX##EIGPREFIX b_tmp; \ +\ + if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \ + b = b_tmp.data(); \ + ldb = convert_index(b_tmp.outerStride()); \ +\ +/* Set uplo */ \ + uplo = IsLower ? 'L' : 'U'; \ + if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ +/* Set a, lda */ \ + Map > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \ + MatrixLhs a_tmp; \ +\ + if ((conjA!=0) || (SetDiag==0)) { \ + if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \ + if (IsZeroDiag) \ + a_tmp.diagonal().setZero(); \ + else if (IsUnitDiag) \ + a_tmp.diagonal().setOnes();\ + a = a_tmp.data(); \ + lda = convert_index(a_tmp.outerStride()); \ + } else { \ + a = _lhs; \ + lda = convert_index(lhsStride); \ + } \ + /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \ +/* call ?trmm*/ \ + BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ +\ +/* Add op(a_triangular)*b into res*/ \ + Map > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ + res_tmp=res_tmp+b_tmp; \ + } \ +}; + +EIGEN_BLAS_TRMM_L(double, double, d, d) +EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z) +EIGEN_BLAS_TRMM_L(float, float, f, s) +EIGEN_BLAS_TRMM_L(scomplex, float, cf, c) + +// implements col-major += alpha * op(general) * op(triangular) +#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +template \ +struct product_triangular_matrix_matrix_trmm \ +{ \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + LowUp = IsLower ? Lower : Upper, \ + conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 
1 : 0 \ + }; \ +\ + static void run( \ + Index _rows, Index _cols, Index _depth, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, level3_blocking& blocking) \ + { \ + Index diagSize = (std::min)(_cols,_depth); \ + Index rows = _rows; \ + Index depth = IsLower ? _depth : diagSize; \ + Index cols = IsLower ? diagSize : _cols; \ +\ + typedef Matrix MatrixLhs; \ + typedef Matrix MatrixRhs; \ +\ +/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \ + if (cols != depth) { \ +\ + int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \ +\ + if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \ + /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \ + product_triangular_matrix_matrix::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \ + } else { \ + /* Make sense to call GEMM */ \ + Map > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \ + MatrixRhs aa_tmp=rhsMap.template triangularView(); \ + BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ + gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ + general_matrix_matrix_product::run( \ + rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ +\ + /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ + } \ + return; \ + } \ + char side = 'R', transa, uplo, diag = 'N'; \ + EIGTYPE *b; \ + const EIGTYPE *a; \ + BlasIndex m, n, lda, ldb; \ +\ +/* Set m, n */ \ + m = convert_index(rows); \ + n = convert_index(diagSize); \ +\ +/* Set trans */ \ + transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \ +\ +/* Set b, ldb */ \ + Map > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \ + MatrixX##EIGPREFIX b_tmp; \ +\ + if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \ + b = b_tmp.data(); \ + ldb = convert_index(b_tmp.outerStride()); \ +\ +/* Set uplo */ \ + uplo = IsLower ? 'L' : 'U'; \ + if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ +/* Set a, lda */ \ + Map > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \ + MatrixRhs a_tmp; \ +\ + if ((conjA!=0) || (SetDiag==0)) { \ + if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \ + if (IsZeroDiag) \ + a_tmp.diagonal().setZero(); \ + else if (IsUnitDiag) \ + a_tmp.diagonal().setOnes();\ + a = a_tmp.data(); \ + lda = convert_index(a_tmp.outerStride()); \ + } else { \ + a = _rhs; \ + lda = convert_index(rhsStride); \ + } \ + /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! 
\n";*/ \ +/* call ?trmm*/ \ + BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ +\ +/* Add op(a_triangular)*b into res*/ \ + Map > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ + res_tmp=res_tmp+b_tmp; \ + } \ +}; + +EIGEN_BLAS_TRMM_R(double, double, d, d) +EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z) +EIGEN_BLAS_TRMM_R(float, float, f, s) +EIGEN_BLAS_TRMM_R(scomplex, float, cf, c) + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h deleted file mode 100644 index 47a8698a7..000000000 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +++ /dev/null @@ -1,302 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation. All rights reserved. - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL - * Triangular matrix * matrix product functionality based on ?TRMM. 
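/* --------------------------------------------------------------------------
 * Illustrative sketch of why the TRMM path above ends with
 * "res_tmp = res_tmp + b_tmp": BLAS ?TRMM works in place, overwriting B
 * with alpha*op(A)*B. The macro therefore copies the rhs into b_tmp, lets
 * TRMM scale that copy, and only then accumulates it into res. Hypothetical
 * driver; assumes an f77 BLAS (dtrmm_) with the conventional binding.
 * ------------------------------------------------------------------------*/
#include <cstdio>

extern "C" void dtrmm_(const char* side, const char* uplo, const char* transa,
                       const char* diag, const int* m, const int* n,
                       const double* alpha, const double* a, const int* lda,
                       double* b, const int* ldb);

int main() {
  double A[] = {1, 2, /*never read*/0, 3};  // lower triangular [[1,0],[2,3]]
  double B[] = {1, 1, 1, 1};                // overwritten by the call below
  int m = 2, n = 2;
  double alpha = 1.0;
  char side = 'L', uplo = 'L', transa = 'N', diag = 'N';
  dtrmm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, A, &m, B, &m);
  // B now holds A*B_in; a separate "+=" into the result buffer is still
  // needed, which is precisely the final res_tmp addition in the macro.
  std::printf("B = [%g %g; %g %g]\n", B[0], B[2], B[1], B[3]); // [1 1; 5 5]
  return 0;
}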
- ******************************************************************************** -*/ - -#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H -#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H - -namespace Eigen { - -namespace internal { - - -template -struct product_triangular_matrix_matrix_trmm : - product_triangular_matrix_matrix {}; - - -// try to go to BLAS specialization -#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ -template \ -struct product_triangular_matrix_matrix { \ - static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\ - const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) { \ - product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ - } \ -}; - -EIGEN_MKL_TRMM_SPECIALIZE(double, true) -EIGEN_MKL_TRMM_SPECIALIZE(double, false) -EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true) -EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false) -EIGEN_MKL_TRMM_SPECIALIZE(float, true) -EIGEN_MKL_TRMM_SPECIALIZE(float, false) -EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true) -EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false) - -// implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_MKL_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ -template \ -struct product_triangular_matrix_matrix_trmm \ -{ \ - enum { \ - IsLower = (Mode&Lower) == Lower, \ - SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ - IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ - IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ - LowUp = IsLower ? Lower : Upper, \ - conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \ - }; \ -\ - static void run( \ - Index _rows, Index _cols, Index _depth, \ - const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ - EIGTYPE alpha, level3_blocking& blocking) \ - { \ - Index diagSize = (std::min)(_rows,_depth); \ - Index rows = IsLower ? _rows : diagSize; \ - Index depth = IsLower ? diagSize : _depth; \ - Index cols = _cols; \ -\ - typedef Matrix MatrixLhs; \ - typedef Matrix MatrixRhs; \ -\ -/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \ - if (rows != depth) { \ -\ - /* FIXME handle mkl_domain_get_max_threads */ \ - /*int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS);*/ int nthr = 1;\ -\ - if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \ - /* Most likely no benefit to call TRMM or GEMM from MKL*/ \ - product_triangular_matrix_matrix::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ - /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \ - } else { \ - /* Make sense to call GEMM */ \ - Map > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \ - MatrixLhs aa_tmp=lhsMap.template triangularView(); \ - BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ - gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product::run( \ - rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ -\ - /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! 
" << nthr<<" \n";*/ \ - } \ - return; \ - } \ - char side = 'L', transa, uplo, diag = 'N'; \ - EIGTYPE *b; \ - const EIGTYPE *a; \ - BlasIndex m, n, lda, ldb; \ -\ -/* Set m, n */ \ - m = convert_index(diagSize); \ - n = convert_index(cols); \ -\ -/* Set trans */ \ - transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ -\ -/* Set b, ldb */ \ - Map > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \ - MatrixX##EIGPREFIX b_tmp; \ -\ - if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \ - b = b_tmp.data(); \ - ldb = convert_index(b_tmp.outerStride()); \ -\ -/* Set uplo */ \ - uplo = IsLower ? 'L' : 'U'; \ - if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ -/* Set a, lda */ \ - Map > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \ - MatrixLhs a_tmp; \ -\ - if ((conjA!=0) || (SetDiag==0)) { \ - if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \ - if (IsZeroDiag) \ - a_tmp.diagonal().setZero(); \ - else if (IsUnitDiag) \ - a_tmp.diagonal().setOnes();\ - a = a_tmp.data(); \ - lda = convert_index(a_tmp.outerStride()); \ - } else { \ - a = _lhs; \ - lda = convert_index(lhsStride); \ - } \ - /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \ -/* call ?trmm*/ \ - MKLPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ -\ -/* Add op(a_triangular)*b into res*/ \ - Map > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ - res_tmp=res_tmp+b_tmp; \ - } \ -}; - -EIGEN_MKL_TRMM_L(double, double, d, d) -EIGEN_MKL_TRMM_L(dcomplex, double, cd, z) -EIGEN_MKL_TRMM_L(float, float, f, s) -EIGEN_MKL_TRMM_L(scomplex, float, cf, c) - -// implements col-major += alpha * op(general) * op(triangular) -#define EIGEN_MKL_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ -template \ -struct product_triangular_matrix_matrix_trmm \ -{ \ - enum { \ - IsLower = (Mode&Lower) == Lower, \ - SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ - IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ - IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ - LowUp = IsLower ? Lower : Upper, \ - conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \ - }; \ -\ - static void run( \ - Index _rows, Index _cols, Index _depth, \ - const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ - EIGTYPE alpha, level3_blocking& blocking) \ - { \ - Index diagSize = (std::min)(_cols,_depth); \ - Index rows = _rows; \ - Index depth = IsLower ? _depth : diagSize; \ - Index cols = IsLower ? diagSize : _cols; \ -\ - typedef Matrix MatrixLhs; \ - typedef Matrix MatrixRhs; \ -\ -/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \ - if (cols != depth) { \ -\ - int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS)*/; \ -\ - if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \ - /* Most likely no benefit to call TRMM or GEMM from MKL*/ \ - product_triangular_matrix_matrix::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ - /*std::cout << "TRMM_R: A is not square! 
Go to Eigen TRMM implementation!\n";*/ \ - } else { \ - /* Make sense to call GEMM */ \ - Map > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \ - MatrixRhs aa_tmp=rhsMap.template triangularView(); \ - BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ - gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product::run( \ - rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ -\ - /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \ - } \ - return; \ - } \ - char side = 'R', transa, uplo, diag = 'N'; \ - EIGTYPE *b; \ - const EIGTYPE *a; \ - BlasIndex m, n, lda, ldb; \ -\ -/* Set m, n */ \ - m = convert_index(rows); \ - n = convert_index(diagSize); \ -\ -/* Set trans */ \ - transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \ -\ -/* Set b, ldb */ \ - Map > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \ - MatrixX##EIGPREFIX b_tmp; \ -\ - if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \ - b = b_tmp.data(); \ - ldb = convert_index(b_tmp.outerStride()); \ -\ -/* Set uplo */ \ - uplo = IsLower ? 'L' : 'U'; \ - if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ -/* Set a, lda */ \ - Map > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \ - MatrixRhs a_tmp; \ -\ - if ((conjA!=0) || (SetDiag==0)) { \ - if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \ - if (IsZeroDiag) \ - a_tmp.diagonal().setZero(); \ - else if (IsUnitDiag) \ - a_tmp.diagonal().setOnes();\ - a = a_tmp.data(); \ - lda = convert_index(a_tmp.outerStride()); \ - } else { \ - a = _rhs; \ - lda = convert_index(rhsStride); \ - } \ - /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \ -/* call ?trmm*/ \ - MKLPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ -\ -/* Add op(a_triangular)*b into res*/ \ - Map > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ - res_tmp=res_tmp+b_tmp; \ - } \ -}; - -EIGEN_MKL_TRMM_R(double, double, d, d) -EIGEN_MKL_TRMM_R(dcomplex, double, cd, z) -EIGEN_MKL_TRMM_R(float, float, f, s) -EIGEN_MKL_TRMM_R(scomplex, float, cf, c) - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h new file mode 100644 index 000000000..07bf26ce5 --- /dev/null +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -0,0 +1,241 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Triangular matrix-vector product functionality based on ?TRMV. + ******************************************************************************** +*/ + +#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H + +namespace Eigen { + +namespace internal { + +/********************************************************************** +* This file implements triangular matrix-vector multiplication using BLAS +**********************************************************************/ + +// trmv/hemv specialization + +template +struct triangular_matrix_vector_product_trmv : + triangular_matrix_vector_product {}; + +#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ +template \ +struct triangular_matrix_vector_product { \ + static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ + const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + triangular_matrix_vector_product_trmv::run( \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + } \ +}; \ +template \ +struct triangular_matrix_vector_product { \ + static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ + const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + triangular_matrix_vector_product_trmv::run( \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + } \ +}; + +EIGEN_BLAS_TRMV_SPECIALIZE(double) +EIGEN_BLAS_TRMV_SPECIALIZE(float) +EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) + +// implements col-major: res += alpha * op(triangular) * vector +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +template \ +struct triangular_matrix_vector_product_trmv { \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + LowUp = IsLower ? Lower : Upper \ + }; \ + static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ + { \ + if (ConjLhs || IsZeroDiag) { \ + triangular_matrix_vector_product::run( \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + return; \ + }\ + Index size = (std::min)(_rows,_cols); \ + Index rows = IsLower ? _rows : size; \ + Index cols = IsLower ? 
size : _cols; \ +\ + typedef VectorX##EIGPREFIX VectorRhs; \ + EIGTYPE *x, *y;\ +\ +/* Set x*/ \ + Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + VectorRhs x_tmp; \ + if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ + x = x_tmp.data(); \ +\ +/* Square part handling */\ +\ + char trans, uplo, diag; \ + BlasIndex m, n, lda, incx, incy; \ + EIGTYPE const *a; \ + EIGTYPE beta(1); \ +\ +/* Set m, n */ \ + n = convert_index(size); \ + lda = convert_index(lhsStride); \ + incx = 1; \ + incy = convert_index(resIncr); \ +\ +/* Set uplo, trans and diag*/ \ + trans = 'N'; \ + uplo = IsLower ? 'L' : 'U'; \ + diag = IsUnitDiag ? 'U' : 'N'; \ +\ +/* call ?TRMV*/ \ + BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ +\ +/* Add op(a_tr)rhs into res*/ \ + BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ +/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \ + if (size<(std::max)(rows,cols)) { \ + if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ + x = x_tmp.data(); \ + if (size(rows-size); \ + n = convert_index(size); \ + } \ + else { \ + x += size; \ + y = _res; \ + a = _lhs + size*lda; \ + m = convert_index(size); \ + n = convert_index(cols-size); \ + } \ + BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + } \ + } \ +}; + +EIGEN_BLAS_TRMV_CM(double, double, d, d) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) +EIGEN_BLAS_TRMV_CM(float, float, f, s) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) + +// implements row-major: res += alpha * op(triangular) * vector +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +template \ +struct triangular_matrix_vector_product_trmv { \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + LowUp = IsLower ? Lower : Upper \ + }; \ + static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ + { \ + if (IsZeroDiag) { \ + triangular_matrix_vector_product::run( \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + return; \ + }\ + Index size = (std::min)(_rows,_cols); \ + Index rows = IsLower ? _rows : size; \ + Index cols = IsLower ? size : _cols; \ +\ + typedef VectorX##EIGPREFIX VectorRhs; \ + EIGTYPE *x, *y;\ +\ +/* Set x*/ \ + Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + VectorRhs x_tmp; \ + if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ + x = x_tmp.data(); \ +\ +/* Square part handling */\ +\ + char trans, uplo, diag; \ + BlasIndex m, n, lda, incx, incy; \ + EIGTYPE const *a; \ + EIGTYPE beta(1); \ +\ +/* Set m, n */ \ + n = convert_index(size); \ + lda = convert_index(lhsStride); \ + incx = 1; \ + incy = convert_index(resIncr); \ +\ +/* Set uplo, trans and diag*/ \ + trans = ConjLhs ? 'C' : 'T'; \ + uplo = IsLower ? 'U' : 'L'; \ + diag = IsUnitDiag ? 'U' : 'N'; \ +\ +/* call ?TRMV*/ \ + BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ +\ +/* Add op(a_tr)rhs into res*/ \ + BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ +/* Non-square case - doesn't fit to BLAS ?TRMV. 
Fall to default triangular product*/ \ + if (size<(std::max)(rows,cols)) { \ + if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ + x = x_tmp.data(); \ + if (size(rows-size); \ + n = convert_index(size); \ + } \ + else { \ + x += size; \ + y = _res; \ + a = _lhs + size; \ + m = convert_index(size); \ + n = convert_index(cols-size); \ + } \ + BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + } \ + } \ +}; + +EIGEN_BLAS_TRMV_RM(double, double, d, d) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) +EIGEN_BLAS_TRMV_RM(float, float, f, s) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) + +} // end namespase internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h deleted file mode 100644 index 17c9eeb44..000000000 --- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +++ /dev/null @@ -1,241 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation. All rights reserved. - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL - * Triangular matrix-vector product functionality based on ?TRMV. 
- ******************************************************************************** -*/ - -#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H -#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H - -namespace Eigen { - -namespace internal { - -/********************************************************************** -* This file implements triangular matrix-vector multiplication using BLAS -**********************************************************************/ - -// trmv/hemv specialization - -template -struct triangular_matrix_vector_product_trmv : - triangular_matrix_vector_product {}; - -#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \ -template \ -struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ - triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ - } \ -}; \ -template \ -struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ - triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ - } \ -}; - -EIGEN_MKL_TRMV_SPECIALIZE(double) -EIGEN_MKL_TRMV_SPECIALIZE(float) -EIGEN_MKL_TRMV_SPECIALIZE(dcomplex) -EIGEN_MKL_TRMV_SPECIALIZE(scomplex) - -// implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_MKL_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ -template \ -struct triangular_matrix_vector_product_trmv { \ - enum { \ - IsLower = (Mode&Lower) == Lower, \ - SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ - IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ - IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ - LowUp = IsLower ? Lower : Upper \ - }; \ - static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ - { \ - if (ConjLhs || IsZeroDiag) { \ - triangular_matrix_vector_product::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ - return; \ - }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ -\ - typedef VectorX##EIGPREFIX VectorRhs; \ - EIGTYPE *x, *y;\ -\ -/* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ - VectorRhs x_tmp; \ - if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ - x = x_tmp.data(); \ -\ -/* Square part handling */\ -\ - char trans, uplo, diag; \ - BlasIndex m, n, lda, incx, incy; \ - EIGTYPE const *a; \ - EIGTYPE beta(1); \ -\ -/* Set m, n */ \ - n = convert_index(size); \ - lda = convert_index(lhsStride); \ - incx = 1; \ - incy = convert_index(resIncr); \ -\ -/* Set uplo, trans and diag*/ \ - trans = 'N'; \ - uplo = IsLower ? 'L' : 'U'; \ - diag = IsUnitDiag ? 'U' : 'N'; \ -\ -/* call ?TRMV*/ \ - MKLPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ -\ -/* Add op(a_tr)rhs into res*/ \ - MKLPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ -/* Non-square case - doesn't fit to MKL ?TRMV. 
Fall to default triangular product*/ \ - if (size<(std::max)(rows,cols)) { \ - if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ - x = x_tmp.data(); \ - if (size(rows-size); \ - n = convert_index(size); \ - } \ - else { \ - x += size; \ - y = _res; \ - a = _lhs + size*lda; \ - m = convert_index(size); \ - n = convert_index(cols-size); \ - } \ - MKLPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ - } \ - } \ -}; - -EIGEN_MKL_TRMV_CM(double, double, d, d) -EIGEN_MKL_TRMV_CM(dcomplex, double, cd, z) -EIGEN_MKL_TRMV_CM(float, float, f, s) -EIGEN_MKL_TRMV_CM(scomplex, float, cf, c) - -// implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_MKL_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, MKLPREFIX) \ -template \ -struct triangular_matrix_vector_product_trmv { \ - enum { \ - IsLower = (Mode&Lower) == Lower, \ - SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ - IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ - IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ - LowUp = IsLower ? Lower : Upper \ - }; \ - static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ - { \ - if (IsZeroDiag) { \ - triangular_matrix_vector_product::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ - return; \ - }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ -\ - typedef VectorX##EIGPREFIX VectorRhs; \ - EIGTYPE *x, *y;\ -\ -/* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ - VectorRhs x_tmp; \ - if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ - x = x_tmp.data(); \ -\ -/* Square part handling */\ -\ - char trans, uplo, diag; \ - BlasIndex m, n, lda, incx, incy; \ - EIGTYPE const *a; \ - EIGTYPE beta(1); \ -\ -/* Set m, n */ \ - n = convert_index(size); \ - lda = convert_index(lhsStride); \ - incx = 1; \ - incy = convert_index(resIncr); \ -\ -/* Set uplo, trans and diag*/ \ - trans = ConjLhs ? 'C' : 'T'; \ - uplo = IsLower ? 'U' : 'L'; \ - diag = IsUnitDiag ? 'U' : 'N'; \ -\ -/* call ?TRMV*/ \ - MKLPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ -\ -/* Add op(a_tr)rhs into res*/ \ - MKLPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ -/* Non-square case - doesn't fit to MKL ?TRMV. 
Fall to default triangular product*/ \ - if (size<(std::max)(rows,cols)) { \ - if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ - x = x_tmp.data(); \ - if (size(rows-size); \ - n = convert_index(size); \ - } \ - else { \ - x += size; \ - y = _res; \ - a = _lhs + size; \ - m = convert_index(size); \ - n = convert_index(cols-size); \ - } \ - MKLPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ - } \ - } \ -}; - -EIGEN_MKL_TRMV_RM(double, double, d, d) -EIGEN_MKL_TRMV_RM(dcomplex, double, cd, z) -EIGEN_MKL_TRMV_RM(float, float, f, s) -EIGEN_MKL_TRMV_RM(scomplex, float, cf, c) - -} // end namespase internal - -} // end namespace Eigen - -#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h new file mode 100644 index 000000000..88c0fb794 --- /dev/null +++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h @@ -0,0 +1,151 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Triangular matrix * matrix product functionality based on ?TRMM. + ******************************************************************************** +*/ + +#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H + +namespace Eigen { + +namespace internal { + +// implements LeftSide op(triangular)^-1 * general +#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \ +template \ +struct triangular_solve_matrix \ +{ \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 
1 : 0 \ + }; \ + static void run( \ + Index size, Index otherSize, \ + const EIGTYPE* _tri, Index triStride, \ + EIGTYPE* _other, Index otherStride, level3_blocking& /*blocking*/) \ + { \ + BlasIndex m = convert_index(size), n = convert_index(otherSize), lda, ldb; \ + char side = 'L', uplo, diag='N', transa; \ + /* Set alpha_ */ \ + EIGTYPE alpha(1); \ + ldb = convert_index(otherStride);\ +\ + const EIGTYPE *a; \ +/* Set trans */ \ + transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \ +/* Set uplo */ \ + uplo = IsLower ? 'L' : 'U'; \ + if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ +/* Set a, lda */ \ + typedef Matrix MatrixTri; \ + Map > tri(_tri,size,size,OuterStride<>(triStride)); \ + MatrixTri a_tmp; \ +\ + if (conjA) { \ + a_tmp = tri.conjugate(); \ + a = a_tmp.data(); \ + lda = convert_index(a_tmp.outerStride()); \ + } else { \ + a = _tri; \ + lda = convert_index(triStride); \ + } \ + if (IsUnitDiag) diag='U'; \ +/* call ?trsm*/ \ + BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ + } \ +}; + +EIGEN_BLAS_TRSM_L(double, double, d) +EIGEN_BLAS_TRSM_L(dcomplex, double, z) +EIGEN_BLAS_TRSM_L(float, float, s) +EIGEN_BLAS_TRSM_L(scomplex, float, c) + + +// implements RightSide general * op(triangular)^-1 +#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \ +template \ +struct triangular_solve_matrix \ +{ \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \ + }; \ + static void run( \ + Index size, Index otherSize, \ + const EIGTYPE* _tri, Index triStride, \ + EIGTYPE* _other, Index otherStride, level3_blocking& /*blocking*/) \ + { \ + BlasIndex m = convert_index(otherSize), n = convert_index(size), lda, ldb; \ + char side = 'R', uplo, diag='N', transa; \ + /* Set alpha_ */ \ + EIGTYPE alpha(1); \ + ldb = convert_index(otherStride);\ +\ + const EIGTYPE *a; \ +/* Set trans */ \ + transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \ +/* Set uplo */ \ + uplo = IsLower ? 'L' : 'U'; \ + if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ +/* Set a, lda */ \ + typedef Matrix MatrixTri; \ + Map > tri(_tri,size,size,OuterStride<>(triStride)); \ + MatrixTri a_tmp; \ +\ + if (conjA) { \ + a_tmp = tri.conjugate(); \ + a = a_tmp.data(); \ + lda = convert_index(a_tmp.outerStride()); \ + } else { \ + a = _tri; \ + lda = convert_index(triStride); \ + } \ + if (IsUnitDiag) diag='U'; \ +/* call ?trsm*/ \ + BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ + /*std::cout << "TRMS_L specialization!\n";*/ \ + } \ +}; + +EIGEN_BLAS_TRSM_R(double, double, d) +EIGEN_BLAS_TRSM_R(dcomplex, double, z) +EIGEN_BLAS_TRSM_R(float, float, s) +EIGEN_BLAS_TRSM_R(scomplex, float, c) + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h deleted file mode 100644 index 1f68a1cec..000000000 --- a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +++ /dev/null @@ -1,151 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation. All rights reserved. 
- - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL - * Triangular matrix * matrix product functionality based on ?TRMM. - ******************************************************************************** -*/ - -#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H -#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H - -namespace Eigen { - -namespace internal { - -// implements LeftSide op(triangular)^-1 * general -#define EIGEN_MKL_TRSM_L(EIGTYPE, BLASTYPE, MKLPREFIX) \ -template \ -struct triangular_solve_matrix \ -{ \ - enum { \ - IsLower = (Mode&Lower) == Lower, \ - IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ - IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ - conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \ - }; \ - static void run( \ - Index size, Index otherSize, \ - const EIGTYPE* _tri, Index triStride, \ - EIGTYPE* _other, Index otherStride, level3_blocking& /*blocking*/) \ - { \ - BlasIndex m = convert_index(size), n = convert_index(otherSize), lda, ldb; \ - char side = 'L', uplo, diag='N', transa; \ - /* Set alpha_ */ \ - EIGTYPE alpha(1); \ - ldb = convert_index(otherStride);\ -\ - const EIGTYPE *a; \ -/* Set trans */ \ - transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \ -/* Set uplo */ \ - uplo = IsLower ? 'L' : 'U'; \ - if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 
'U' : 'L'; \ -/* Set a, lda */ \ - typedef Matrix MatrixTri; \ - Map > tri(_tri,size,size,OuterStride<>(triStride)); \ - MatrixTri a_tmp; \ -\ - if (conjA) { \ - a_tmp = tri.conjugate(); \ - a = a_tmp.data(); \ - lda = convert_index(a_tmp.outerStride()); \ - } else { \ - a = _tri; \ - lda = convert_index(triStride); \ - } \ - if (IsUnitDiag) diag='U'; \ -/* call ?trsm*/ \ - MKLPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ - } \ -}; - -EIGEN_MKL_TRSM_L(double, double, d) -EIGEN_MKL_TRSM_L(dcomplex, double, z) -EIGEN_MKL_TRSM_L(float, float, s) -EIGEN_MKL_TRSM_L(scomplex, float, c) - - -// implements RightSide general * op(triangular)^-1 -#define EIGEN_MKL_TRSM_R(EIGTYPE, BLASTYPE, MKLPREFIX) \ -template \ -struct triangular_solve_matrix \ -{ \ - enum { \ - IsLower = (Mode&Lower) == Lower, \ - IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ - IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ - conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \ - }; \ - static void run( \ - Index size, Index otherSize, \ - const EIGTYPE* _tri, Index triStride, \ - EIGTYPE* _other, Index otherStride, level3_blocking& /*blocking*/) \ - { \ - BlasIndex m = convert_index(otherSize), n = convert_index(size), lda, ldb; \ - char side = 'R', uplo, diag='N', transa; \ - /* Set alpha_ */ \ - EIGTYPE alpha(1); \ - ldb = convert_index(otherStride);\ -\ - const EIGTYPE *a; \ -/* Set trans */ \ - transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \ -/* Set uplo */ \ - uplo = IsLower ? 'L' : 'U'; \ - if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ -/* Set a, lda */ \ - typedef Matrix MatrixTri; \ - Map > tri(_tri,size,size,OuterStride<>(triStride)); \ - MatrixTri a_tmp; \ -\ - if (conjA) { \ - a_tmp = tri.conjugate(); \ - a = a_tmp.data(); \ - lda = convert_index(a_tmp.outerStride()); \ - } else { \ - a = _tri; \ - lda = convert_index(triStride); \ - } \ - if (IsUnitDiag) diag='U'; \ -/* call ?trsm*/ \ - MKLPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ - /*std::cout << "TRMS_L specialization!\n";*/ \ - } \ -}; - -EIGEN_MKL_TRSM_R(double, double, d) -EIGEN_MKL_TRSM_R(dcomplex, double, z) -EIGEN_MKL_TRSM_R(float, float, s) -EIGEN_MKL_TRSM_R(scomplex, float, c) - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H -- cgit v1.2.3 From 097d1e8823629f14f8cc57ab065e7ab6fb653d0f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 16:09:29 +0200 Subject: Cleanup obsolete assign_scalar_eig2mkl helper. --- .../products/GeneralMatrixMatrixTriangular_BLAS.h | 11 ++---- Eigen/src/Core/util/MKL_support.h | 40 ---------------------- 2 files changed, 2 insertions(+), 49 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 943d25bd1..911df8ff3 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -87,12 +87,8 @@ struct general_matrix_matrix_rankupdate(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 
'T':'N'; \ - BLASTYPE alpha_, beta_; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, EIGTYPE(1)); \ - BLASFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \ + EIGTYPE beta; \ + BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ } \ }; @@ -115,9 +111,6 @@ struct general_matrix_matrix_rankupdate(alpha_, alpha); */\ -/* assign_scalar_eig2mkl(beta_, EIGTYPE(1));*/ \ alpha_ = alpha.real(); \ beta_ = 1.0; \ /* Copy with conjugation in some cases*/ \ diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index 382014e66..bf47a626b 100644 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -119,46 +119,6 @@ typedef int BlasIndex; typedef MKL_INT BlasIndex; #endif -namespace internal { - -template -static inline void assign_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) { - mklScalar=eigenScalar; -} - -template -static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) { - mklScalar=eigenScalar; -} - -#ifdef EIGEN_USE_MKL -template <> -inline void assign_scalar_eig2mkl(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=eigenScalar.imag(); -} - -template <> -inline void assign_scalar_eig2mkl(MKL_Complex8& mklScalar, const scomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=eigenScalar.imag(); -} - -template <> -inline void assign_conj_scalar_eig2mkl(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=-eigenScalar.imag(); -} - -template <> -inline void assign_conj_scalar_eig2mkl(MKL_Complex8& mklScalar, const scomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=-eigenScalar.imag(); -} -#endif - -} // end namespace internal - } // end namespace Eigen #if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) -- cgit v1.2.3 From 048343028371a8b7f79e4007a48caa8aff83e0de Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 17:12:31 +0200 Subject: Move LAPACK declarations from blas.h to lapack.h and fix compatibility with EIGEN_USE_MKL --- Eigen/src/Core/util/MKL_support.h | 8 +- Eigen/src/misc/blas.h | 176 ++++---------------------------------- Eigen/src/misc/lapack.h | 152 ++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+), 162 deletions(-) create mode 100644 Eigen/src/misc/lapack.h diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index bf47a626b..8c9239b1d 100644 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -113,15 +113,15 @@ namespace Eigen { typedef std::complex dcomplex; typedef std::complex scomplex; -#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) -typedef int BlasIndex; -#else +#if defined(EIGEN_USE_MKL) typedef MKL_INT BlasIndex; +#else +typedef int BlasIndex; #endif } // end namespace Eigen -#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) +#if defined(EIGEN_USE_BLAS) #include "../../misc/blas.h" #endif diff --git a/Eigen/src/misc/blas.h b/Eigen/src/misc/blas.h index ae0c393f1..25215b15e 100644 --- a/Eigen/src/misc/blas.h +++ b/Eigen/src/misc/blas.h @@ -30,15 +30,15 @@ int BLASFUNC(cdotcw) (int *, float *, int *, float *, int *, float*); int BLASFUNC(zdotuw) (int *, double *, int *, double *, int *, double*); int BLASFUNC(zdotcw) (int *, double *, int *, double *, int 
*, double*); -int BLASFUNC(saxpy) (const int *, const float *, const float *, const int *, float *, int *); -int BLASFUNC(daxpy) (const int *, const double *, const double *, const int *, double *, int *); -int BLASFUNC(qaxpy) (const int *, const double *, const double *, const int *, double *, int *); -int BLASFUNC(caxpy) (const int *, const float *, const float *, const int *, float *, int *); -int BLASFUNC(zaxpy) (const int *, const double *, const double *, const int *, double *, int *); -int BLASFUNC(xaxpy) (const int *, const double *, const double *, const int *, double *, int *); -int BLASFUNC(caxpyc)(const int *, const float *, const float *, const int *, float *, int *); -int BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, int *); -int BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, int *); +int BLASFUNC(saxpy) (const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(daxpy) (const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(qaxpy) (const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(caxpy) (const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(zaxpy) (const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(xaxpy) (const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(caxpyc)(const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, const int *); int BLASFUNC(scopy) (int *, float *, int *, float *, int *); int BLASFUNC(dcopy) (int *, double *, int *, double *, int *); @@ -229,9 +229,6 @@ int BLASFUNC(xtbsv) (char *, char *, char *, int *, int *, double *, int *, doub int BLASFUNC(ssymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); int BLASFUNC(dsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(qsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -int BLASFUNC(csymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -int BLASFUNC(zsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -int BLASFUNC(xsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(sspmv) (char *, int *, float *, float *, float *, int *, float *, float *, int *); @@ -239,38 +236,17 @@ int BLASFUNC(dspmv) (char *, int *, double *, double *, double *, int *, double *, double *, int *); int BLASFUNC(qspmv) (char *, int *, double *, double *, double *, int *, double *, double *, int *); -int BLASFUNC(cspmv) (char *, int *, float *, float *, - float *, int *, float *, float *, int *); -int BLASFUNC(zspmv) (char *, int *, double *, double *, - double *, int *, double *, double *, int *); -int BLASFUNC(xspmv) (char 
*, int *, double *, double *, - double *, int *, double *, double *, int *); -int BLASFUNC(ssyr) (char *, int *, float *, float *, int *, - float *, int *); -int BLASFUNC(dsyr) (char *, int *, double *, double *, int *, - double *, int *); -int BLASFUNC(qsyr) (char *, int *, double *, double *, int *, - double *, int *); -int BLASFUNC(csyr) (char *, int *, float *, float *, int *, - float *, int *); -int BLASFUNC(zsyr) (char *, int *, double *, double *, int *, - double *, int *); -int BLASFUNC(xsyr) (char *, int *, double *, double *, int *, - double *, int *); +int BLASFUNC(ssyr) (const char *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(dsyr) (const char *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(qsyr) (const char *, const int *, const double *, const double *, const int *, double *, const int *); -int BLASFUNC(ssyr2) (char *, int *, float *, - float *, int *, float *, int *, float *, int *); -int BLASFUNC(dsyr2) (char *, int *, double *, - double *, int *, double *, int *, double *, int *); -int BLASFUNC(qsyr2) (char *, int *, double *, - double *, int *, double *, int *, double *, int *); -int BLASFUNC(csyr2) (char *, int *, float *, - float *, int *, float *, int *, float *, int *); -int BLASFUNC(zsyr2) (char *, int *, double *, - double *, int *, double *, int *, double *, int *); -int BLASFUNC(xsyr2) (char *, int *, double *, - double *, int *, double *, int *, double *, int *); +int BLASFUNC(ssyr2) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(dsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(qsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(csyr2) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(zsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(xsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *); int BLASFUNC(sspr) (char *, int *, float *, float *, int *, float *); @@ -278,12 +254,6 @@ int BLASFUNC(dspr) (char *, int *, double *, double *, int *, double *); int BLASFUNC(qspr) (char *, int *, double *, double *, int *, double *); -int BLASFUNC(cspr) (char *, int *, float *, float *, int *, - float *); -int BLASFUNC(zspr) (char *, int *, double *, double *, int *, - double *); -int BLASFUNC(xspr) (char *, int *, double *, double *, int *, - double *); int BLASFUNC(sspr2) (char *, int *, float *, float *, int *, float *, int *, float *); @@ -462,116 +432,6 @@ int BLASFUNC(cher2m)(const char *, const char *, const char *, const int *, cons int BLASFUNC(zher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); int BLASFUNC(xher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); -int BLASFUNC(sgemt)(char *, int *, int *, float *, float *, int *, - float *, int *); -int 
BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *, - double *, int *); -int BLASFUNC(cgemt)(char *, int *, int *, float *, float *, int *, - float *, int *); -int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *, - double *, int *); - -int BLASFUNC(sgema)(char *, char *, int *, int *, float *, - float *, int *, float *, float *, int *, float *, int *); -int BLASFUNC(dgema)(char *, char *, int *, int *, double *, - double *, int *, double*, double *, int *, double*, int *); -int BLASFUNC(cgema)(char *, char *, int *, int *, float *, - float *, int *, float *, float *, int *, float *, int *); -int BLASFUNC(zgema)(char *, char *, int *, int *, double *, - double *, int *, double*, double *, int *, double*, int *); - -int BLASFUNC(sgems)(char *, char *, int *, int *, float *, - float *, int *, float *, float *, int *, float *, int *); -int BLASFUNC(dgems)(char *, char *, int *, int *, double *, - double *, int *, double*, double *, int *, double*, int *); -int BLASFUNC(cgems)(char *, char *, int *, int *, float *, - float *, int *, float *, float *, int *, float *, int *); -int BLASFUNC(zgems)(char *, char *, int *, int *, double *, - double *, int *, double*, double *, int *, double*, int *); - -int BLASFUNC(sgetf2)(int *, int *, float *, int *, int *, int *); -int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(cgetf2)(int *, int *, float *, int *, int *, int *); -int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *); - -int BLASFUNC(sgetrf)(int *, int *, float *, int *, int *, int *); -int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(cgetrf)(int *, int *, float *, int *, int *, int *); -int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *); - -int BLASFUNC(slaswp)(int *, float *, int *, int *, int *, int *, int *); -int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *); -int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *); -int BLASFUNC(claswp)(int *, float *, int *, int *, int *, int *, int *); -int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *); -int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *); - -int BLASFUNC(sgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *); -int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); -int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); -int BLASFUNC(cgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *); -int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); -int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); - -int BLASFUNC(sgesv)(int *, int *, float *, int *, int *, float *, int *, int *); -int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *); -int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *); -int BLASFUNC(cgesv)(int *, int *, float *, int *, int *, float *, int *, int *); -int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *); -int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int 
*); - -int BLASFUNC(spotf2)(char *, int *, float *, int *, int *); -int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *); -int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *); -int BLASFUNC(cpotf2)(char *, int *, float *, int *, int *); -int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *); -int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *); - -int BLASFUNC(spotrf)(char *, int *, float *, int *, int *); -int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *); -int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *); -int BLASFUNC(cpotrf)(char *, int *, float *, int *, int *); -int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *); -int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *); - -int BLASFUNC(slauu2)(char *, int *, float *, int *, int *); -int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *); -int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *); -int BLASFUNC(clauu2)(char *, int *, float *, int *, int *); -int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *); -int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *); - -int BLASFUNC(slauum)(char *, int *, float *, int *, int *); -int BLASFUNC(dlauum)(char *, int *, double *, int *, int *); -int BLASFUNC(qlauum)(char *, int *, double *, int *, int *); -int BLASFUNC(clauum)(char *, int *, float *, int *, int *); -int BLASFUNC(zlauum)(char *, int *, double *, int *, int *); -int BLASFUNC(xlauum)(char *, int *, double *, int *, int *); - -int BLASFUNC(strti2)(char *, char *, int *, float *, int *, int *); -int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(ctrti2)(char *, char *, int *, float *, int *, int *); -int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *); - -int BLASFUNC(strtri)(char *, char *, int *, float *, int *, int *); -int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(ctrtri)(char *, char *, int *, float *, int *, int *); -int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *); - -int BLASFUNC(spotri)(char *, int *, float *, int *, int *); -int BLASFUNC(dpotri)(char *, int *, double *, int *, int *); -int BLASFUNC(qpotri)(char *, int *, double *, int *, int *); -int BLASFUNC(cpotri)(char *, int *, float *, int *, int *); -int BLASFUNC(zpotri)(char *, int *, double *, int *, int *); -int BLASFUNC(xpotri)(char *, int *, double *, int *, int *); #ifdef __cplusplus } diff --git a/Eigen/src/misc/lapack.h b/Eigen/src/misc/lapack.h new file mode 100644 index 000000000..249f3575c --- /dev/null +++ b/Eigen/src/misc/lapack.h @@ -0,0 +1,152 @@ +#ifndef LAPACK_H +#define LAPACK_H + +#include "blas.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +int BLASFUNC(csymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + + +int BLASFUNC(cspmv) (char *, int *, float *, float *, + float 
*, int *, float *, float *, int *); +int BLASFUNC(zspmv) (char *, int *, double *, double *, + double *, int *, double *, double *, int *); +int BLASFUNC(xspmv) (char *, int *, double *, double *, + double *, int *, double *, double *, int *); + +int BLASFUNC(csyr) (char *, int *, float *, float *, int *, + float *, int *); +int BLASFUNC(zsyr) (char *, int *, double *, double *, int *, + double *, int *); +int BLASFUNC(xsyr) (char *, int *, double *, double *, int *, + double *, int *); + +int BLASFUNC(cspr) (char *, int *, float *, float *, int *, + float *); +int BLASFUNC(zspr) (char *, int *, double *, double *, int *, + double *); +int BLASFUNC(xspr) (char *, int *, double *, double *, int *, + double *); + +int BLASFUNC(sgemt)(char *, int *, int *, float *, float *, int *, + float *, int *); +int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *, + double *, int *); +int BLASFUNC(cgemt)(char *, int *, int *, float *, float *, int *, + float *, int *); +int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *, + double *, int *); + +int BLASFUNC(sgema)(char *, char *, int *, int *, float *, + float *, int *, float *, float *, int *, float *, int *); +int BLASFUNC(dgema)(char *, char *, int *, int *, double *, + double *, int *, double*, double *, int *, double*, int *); +int BLASFUNC(cgema)(char *, char *, int *, int *, float *, + float *, int *, float *, float *, int *, float *, int *); +int BLASFUNC(zgema)(char *, char *, int *, int *, double *, + double *, int *, double*, double *, int *, double*, int *); + +int BLASFUNC(sgems)(char *, char *, int *, int *, float *, + float *, int *, float *, float *, int *, float *, int *); +int BLASFUNC(dgems)(char *, char *, int *, int *, double *, + double *, int *, double*, double *, int *, double*, int *); +int BLASFUNC(cgems)(char *, char *, int *, int *, float *, + float *, int *, float *, float *, int *, float *, int *); +int BLASFUNC(zgems)(char *, char *, int *, int *, double *, + double *, int *, double*, double *, int *, double*, int *); + +int BLASFUNC(sgetf2)(int *, int *, float *, int *, int *, int *); +int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(cgetf2)(int *, int *, float *, int *, int *, int *); +int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *); + +int BLASFUNC(sgetrf)(int *, int *, float *, int *, int *, int *); +int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(cgetrf)(int *, int *, float *, int *, int *, int *); +int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *); + +int BLASFUNC(slaswp)(int *, float *, int *, int *, int *, int *, int *); +int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *); +int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *); +int BLASFUNC(claswp)(int *, float *, int *, int *, int *, int *, int *); +int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *); +int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *); + +int BLASFUNC(sgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *); +int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); +int BLASFUNC(qgetrs)(char *, int *, int *, double 
*, int *, int *, double *, int *, int *); +int BLASFUNC(cgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *); +int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); +int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); + +int BLASFUNC(sgesv)(int *, int *, float *, int *, int *, float *, int *, int *); +int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *); +int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *); +int BLASFUNC(cgesv)(int *, int *, float *, int *, int *, float *, int *, int *); +int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *); +int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *); + +int BLASFUNC(spotf2)(char *, int *, float *, int *, int *); +int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *); +int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *); +int BLASFUNC(cpotf2)(char *, int *, float *, int *, int *); +int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *); +int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *); + +int BLASFUNC(spotrf)(char *, int *, float *, int *, int *); +int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *); +int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *); +int BLASFUNC(cpotrf)(char *, int *, float *, int *, int *); +int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *); +int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *); + +int BLASFUNC(slauu2)(char *, int *, float *, int *, int *); +int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *); +int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *); +int BLASFUNC(clauu2)(char *, int *, float *, int *, int *); +int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *); +int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *); + +int BLASFUNC(slauum)(char *, int *, float *, int *, int *); +int BLASFUNC(dlauum)(char *, int *, double *, int *, int *); +int BLASFUNC(qlauum)(char *, int *, double *, int *, int *); +int BLASFUNC(clauum)(char *, int *, float *, int *, int *); +int BLASFUNC(zlauum)(char *, int *, double *, int *, int *); +int BLASFUNC(xlauum)(char *, int *, double *, int *, int *); + +int BLASFUNC(strti2)(char *, char *, int *, float *, int *, int *); +int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(ctrti2)(char *, char *, int *, float *, int *, int *); +int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *); + +int BLASFUNC(strtri)(char *, char *, int *, float *, int *, int *); +int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(ctrtri)(char *, char *, int *, float *, int *, int *); +int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *); + +int BLASFUNC(spotri)(char *, int *, float *, int *, int *); +int BLASFUNC(dpotri)(char *, int *, double *, int *, int *); +int BLASFUNC(qpotri)(char *, int *, double *, int *, int *); +int BLASFUNC(cpotri)(char *, int *, float *, int *, int *); +int BLASFUNC(zpotri)(char *, int *, double *, int *, int *); +int BLASFUNC(xpotri)(char *, int *, double *, int *, int *); + +#ifdef __cplusplus +} +#endif + 
+#endif -- cgit v1.2.3 From 91bf925fc17c50a7898d84e56ce3fbbd93e7d920 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Apr 2016 17:13:01 +0200 Subject: Improve constness of level2 blas API. --- blas/common.h | 10 +++++----- blas/level1_impl.h | 6 +++--- blas/level2_cplx_impl.h | 13 +++++++------ blas/level2_impl.h | 33 +++++++++++++++++---------------- blas/level2_real_impl.h | 33 +++++++++++++++++---------------- lapack/lapack_common.h | 1 + 6 files changed, 50 insertions(+), 46 deletions(-) diff --git a/blas/common.h b/blas/common.h index acb50af1b..61d8344d9 100644 --- a/blas/common.h +++ b/blas/common.h @@ -10,8 +10,8 @@ #ifndef EIGEN_BLAS_COMMON_H #define EIGEN_BLAS_COMMON_H -#include -#include +#include "../Eigen/Core" +#include "../Eigen/Jacobi" #include @@ -19,8 +19,7 @@ #error the token SCALAR must be defined to compile this file #endif -#include - +#include "../Eigen/src/misc/blas.h" #define NOTR 0 #define TR 1 @@ -94,6 +93,7 @@ enum typedef Matrix PlainMatrixType; typedef Map, 0, OuterStride<> > MatrixType; +typedef Map, 0, OuterStride<> > ConstMatrixType; typedef Map, 0, InnerStride > StridedVectorType; typedef Map > CompactVectorType; @@ -141,7 +141,7 @@ T* get_compact_vector(T* x, int n, int incx) if(incx==1) return x; - T* ret = new Scalar[n]; + typename Eigen::internal::remove_const::type* ret = new Scalar[n]; if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse(); else make_vector(ret,n) = make_vector(x,n, incx); return ret; diff --git a/blas/level1_impl.h b/blas/level1_impl.h index e623bd178..f857bfa20 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -9,11 +9,11 @@ #include "common.h" -int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy) +int EIGEN_BLAS_FUNC(axpy)(const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, RealScalar *py, const int *incy) { - Scalar* x = reinterpret_cast(px); + const Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - Scalar alpha = *reinterpret_cast(palpha); + Scalar alpha = *reinterpret_cast(palpha); if(*n<=0) return 0; diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h index 2edc51596..e3ce61435 100644 --- a/blas/level2_cplx_impl.h +++ b/blas/level2_cplx_impl.h @@ -16,7 +16,8 @@ * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n hermitian matrix. 
   */
-int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
+int EIGEN_BLAS_FUNC(hemv)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *pa, const int *lda,
+                          const RealScalar *px, const int *incx, const RealScalar *pbeta, RealScalar *py, const int *incy)
 {
   typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
   static const functype func[2] = {
@@ -26,11 +27,11 @@ int EIGEN_BLAS_FUNC(hemv)(const char *uplo, const int *n, const RealScalar *palpha, RealScalar *pa
     (internal::selfadjoint_matrix_vector_product::run),
   };

-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);

   // check arguments
   int info = 0;
@@ -45,7 +46,7 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
   if(*n==0)
     return 1;

-  Scalar* actual_x = get_compact_vector(x,*n,*incx);
+  const Scalar* actual_x = get_compact_vector(x,*n,*incx);
   Scalar* actual_y = get_compact_vector(y,*n,*incy);

   if(beta!=Scalar(1))

diff --git a/blas/level2_impl.h b/blas/level2_impl.h
index d09db0cc6..173f40b44 100644
--- a/blas/level2_impl.h
+++ b/blas/level2_impl.h
@@ -23,7 +23,8 @@ struct general_matrix_vector_product_wrapper
   }
 };

-int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc)
+int EIGEN_BLAS_FUNC(gemv)(const char *opa, const int *m, const int *n, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *incb, const RealScalar *pbeta, RealScalar *pc, const int *incc)
 {
   typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar);
   static const functype func[4] = {
@@ -36,11 +37,11 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
     0
   };

-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);

   // check arguments
   int info = 0;
@@ -62,7 +63,7 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
   if(code!=NOTR)
     std::swap(actual_m,actual_n);

-  Scalar* actual_b = get_compact_vector(b,actual_n,*incb);
+  const Scalar* actual_b = get_compact_vector(b,actual_n,*incb);
   Scalar* actual_c = get_compact_vector(c,actual_m,*incc);

   if(beta!=Scalar(1))
@@ -82,7 +83,7 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
   return 1;
 }

-int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
+int EIGEN_BLAS_FUNC(trsv)(const char *uplo, const char *opa, const char *diag, const int *n, const RealScalar *pa, const int *lda, RealScalar *pb, const int *incb)
 {
   typedef void (*functype)(int, const Scalar *, int, Scalar *);
   static const functype func[16] = {
@@ -116,7 +117,7 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
     0
   };

-  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
   Scalar* b = reinterpret_cast<Scalar*>(pb);

   int info = 0;
@@ -141,7 +142,7 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar


-int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
+int EIGEN_BLAS_FUNC(trmv)(const char *uplo, const char *opa, const char *diag, const int *n, const RealScalar *pa, const int *lda, RealScalar *pb, const int *incb)
 {
   typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, const Scalar&);
   static const functype func[16] = {
@@ -175,7 +176,7 @@ int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar
     0
   };

-  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
   Scalar* b = reinterpret_cast<Scalar*>(pb);

   int info = 0;
@@ -217,11 +218,11 @@ int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar

 int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
 {
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);

   int coeff_rows = *kl+*ku+1;

   int info = 0;
@@ -244,7 +245,7 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
   if(OP(*trans)!=NOTR)
     std::swap(actual_m,actual_n);

-  Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
+  const Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
   Scalar* actual_y = get_compact_vector(y,actual_m,*incy);

   if(beta!=Scalar(1))
@@ -253,7 +254,7 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
     else                  make_vector(actual_y, actual_m) *= beta;
   }

-  MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
+  ConstMatrixType mat_coeffs(a,coeff_rows,*n,*lda);
   int nb = std::min(*n,(*m)+(*ku));
   for(int j=0; j

diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h

     (internal::selfadjoint_matrix_vector_product::run),
   };

-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);

   // check arguments
   int info = 0;
@@ -39,7 +40,7 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
   if(*n==0)
     return 0;

-  Scalar* actual_x = get_compact_vector(x,*n,*incx);
+  const Scalar* actual_x = get_compact_vector(x,*n,*incx);
   Scalar* actual_y = get_compact_vector(y,*n,*incy);

   if(beta!=Scalar(1))
@@ -61,7 +62,7 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
 }

 // C := alpha*x*x' + C
-int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syr)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, RealScalar *pc, const int *ldc)
 {
   typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
   static const functype func[2] = {
@@ -72,9 +73,9 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
     (selfadjoint_rank1_update::run),
   };

-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);

   int info = 0;
   if(UPLO(*uplo)==INVALID) info = 1;
@@ -87,7 +88,7 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
   if(*n==0 || alpha==Scalar(0)) return 1;

   // if the increment is not 1, let's copy it to a temporary vector to enable vectorization
-  Scalar* x_cpy = get_compact_vector(x,*n,*incx);
+  const Scalar* x_cpy = get_compact_vector(x,*n,*incx);

   int code = UPLO(*uplo);
   if(code>=2 || func[code]==0)
@@ -101,7 +102,7 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
 }

 // C := alpha*x*y' + alpha*y*x' + C
-int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syr2)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, const RealScalar *py, const int *incy, RealScalar *pc, const int *ldc)
 {
   typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
   static const functype func[2] = {
@@ -111,10 +112,10 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
     (internal::rank2_update_selector::run),
   };

-  Scalar* x = reinterpret_cast<Scalar*>(px);
-  Scalar* y = reinterpret_cast<Scalar*>(py);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
+  const Scalar* y = reinterpret_cast<const Scalar*>(py);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);

   int info = 0;
   if(UPLO(*uplo)==INVALID) info = 1;
@@ -128,8 +129,8 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
   if(alpha==Scalar(0)) return 1;

-  Scalar* x_cpy = get_compact_vector(x,*n,*incx);
-  Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+  const Scalar* x_cpy = get_compact_vector(x,*n,*incx);
+  const Scalar* y_cpy = get_compact_vector(y,*n,*incy);

   int code = UPLO(*uplo);
   if(code>=2 || func[code]==0)

diff --git a/lapack/lapack_common.h b/lapack/lapack_common.h
index a93598784..c872a813e 100644
--- a/lapack/lapack_common.h
+++ b/lapack/lapack_common.h
@@ -11,6 +11,7 @@
 #define EIGEN_LAPACK_COMMON_H

 #include "../blas/common.h"
+#include "../Eigen/src/misc/lapack.h"

 #define EIGEN_LAPACK_FUNC(FUNC,ARGLIST) \
   extern "C" { int EIGEN_BLAS_FUNC(FUNC) ARGLIST; } \
-- cgit v1.2.3


From 1744b5b5d2a488706cb26ff608741548d4853aa4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Mon, 11 Apr 2016 17:16:07 +0200
Subject: Update doc regarding the genericity of EIGEN_USE_BLAS

---
 doc/UsingIntelMKL.dox | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox
index 02c62ad85..dbe559e53 100644
--- a/doc/UsingIntelMKL.dox
+++ b/doc/UsingIntelMKL.dox
@@ -55,7 +55,7 @@ Operations on other scalar types or mixing reals and complexes will continue to
 In addition you can choose which parts will be substituted by defining one or multiple of the following macros:

 <table class="manual">
-<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (currently works with Intel MKL only)</td></tr>
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (compatible with any F77 BLAS interface, not only Intel MKL)</td></tr>
 <tr><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the Intel Lapacke C interface to Lapack (currently works with Intel MKL only)</td></tr>
 <tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithm of lower robustness are disabled. This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
 <tr><td>\c EIGEN_USE_MKL_VML </td><td>Enables the use of Intel VML (vector operations)</td></tr>
-- cgit v1.2.3
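With this documentation fix, EIGEN_USE_BLAS is advertised as working with any Fortran-77 BLAS rather than with MKL alone. For readers who have not used the macro, the following is a minimal sketch of the typical setup; it assumes OpenBLAS as the backend and its usual link flag, but any F77 BLAS and the matching -l flag would do:

// Build sketch (assumption: OpenBLAS is installed):
//   g++ -O2 -DEIGEN_USE_BLAS demo.cpp -lopenblas -o demo
#define EIGEN_USE_BLAS   // must be defined before the first Eigen header
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd C = A * B;  // large products like this are forwarded to dgemm
  std::cout << C.norm() << "\n";
  return 0;
}

If the macro is defined after an Eigen header has already been included, the substitution silently does not happen and Eigen's built-in kernels are used instead.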
From 833efb39bfe4957934982112fe435ab30a0c3b4f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 11 Apr 2016 11:03:56 -0700
Subject: Added epsilon, dummy_precision, infinity and quiet_NaN NumTraits for fp16

---
 Eigen/src/Core/arch/CUDA/Half.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 3be7e88d7..281b8e4c6 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -366,13 +366,22 @@ template<> struct is_arithmetic<half> { enum { value = true }; };

 template<> struct NumTraits<Eigen::half>
   : GenericNumTraits<Eigen::half>
 {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float dummy_precision() { return 1e-3f; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return internal::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-3f); }
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
     return internal::raw_uint16_to_half(0x7bff);
   }
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
     return internal::raw_uint16_to_half(0xfbff);
   }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return internal::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return internal::raw_uint16_to_half(0x7c01);
+  }
 };

 // Infinity/NaN checks.
-- cgit v1.2.3
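The constants handed to internal::raw_uint16_to_half in this patch are raw IEEE 754 binary16 bit patterns. The decoder below is only an illustration of what they encode; half_bits_to_float is a local helper written for this note, not an Eigen API:

#include <cstdint>
#include <cstdio>
#include <cmath>

// Decode an IEEE 754 binary16 bit pattern into a float:
// 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
static float half_bits_to_float(std::uint16_t h) {
  const int sign     = (h >> 15) & 0x1;
  const int exponent = (h >> 10) & 0x1f;
  const int mantissa =  h        & 0x3ff;
  float value;
  if (exponent == 0) {            // zero or subnormal: m * 2^-24
    value = std::ldexp(static_cast<float>(mantissa), -24);
  } else if (exponent == 0x1f) {  // all-ones exponent: infinity or NaN
    value = (mantissa == 0) ? INFINITY : NAN;
  } else {                        // normal: 1.m * 2^(e-15)
    value = std::ldexp(static_cast<float>(0x400 | mantissa), exponent - 25);
  }
  return sign ? -value : value;
}

int main() {
  std::printf("0x7bff (highest)  = %g\n", half_bits_to_float(0x7bff)); // 65504
  std::printf("0xfbff (lowest)   = %g\n", half_bits_to_float(0xfbff)); // -65504
  std::printf("0x7c00 (infinity) = %g\n", half_bits_to_float(0x7c00)); // inf
  std::printf("0x7c01 (NaN)      = %g\n", half_bits_to_float(0x7c01)); // nan
  std::printf("0x0800 (epsilon)  = %g\n", half_bits_to_float(0x0800)); // 2^-13
  return 0;
}

Running it shows 0x7bff/0xfbff as plus/minus 65504 (the largest finite half values), 0x7c00 as positive infinity, 0x7c01 as a NaN, and 0x0800 as 2^-13, the value this patch returns for epsilon().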
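Returning to the level-2 constness commit above: the practical effect is that callers can now hand read-only buffers straight to the BLAS entry points without a const_cast. A minimal sketch, assuming the conventional Fortran underscore mangling (dgemv_) and linkage against Eigen's BLAS or any other F77 BLAS:

#include <cstdio>

// Matches the const-qualified gemv signature introduced by the patch;
// the int return follows the f2c convention used throughout blas.h.
extern "C" int dgemv_(const char *trans, const int *m, const int *n,
                      const double *alpha, const double *a, const int *lda,
                      const double *x, const int *incx,
                      const double *beta, double *y, const int *incy);

int main() {
  const double A[4] = {1, 2, 3, 4};  // 2x2, column-major: [[1,3],[2,4]]
  const double x[2] = {1, 1};
  double y[2] = {0, 0};
  const double one = 1.0, zero = 0.0;
  const int n = 2, inc = 1;
  // y = 1.0 * A * x + 0.0 * y
  dgemv_("N", &n, &n, &one, A, &n, x, &inc, &zero, y, &inc);
  std::printf("y = [%g, %g]\n", y[0], y[1]);  // prints y = [4, 6]
  return 0;
}

With the pre-patch prototypes, the read-only arrays A and x above would have needed const_cast at the call site; the declaration here mirrors the const-qualified signature that gemv implements after the change.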