From 0f83aeb6b22840c21c3fc2b90d3af18a65a0798a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 14 Apr 2017 10:22:12 +0200 Subject: Improve cmake scripts for Pastix and BLAS detection. --- test/CMakeLists.txt | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'test/CMakeLists.txt') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d337594f5..8da51ce57 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -27,7 +27,7 @@ endif() if(NOT EIGEN_Fortran_COMPILER_WORKS) # search for a default Lapack library to complete Eigen's one - find_package(LAPACK) + find_package(LAPACK QUIET) endif() # configure blas/lapack (use Eigen's ones) @@ -80,23 +80,30 @@ else() endif() -find_package(Pastix) -find_package(Scotch) -find_package(Metis 5.0 REQUIRED) -if(PASTIX_FOUND) +find_package(PASTIX QUIET COMPONENTS METIS SCOTCH) +# check that the PASTIX found is a version without MPI +find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS + NAMES pastix_nompi.h + HINTS ${PASTIX_INCLUDE_DIRS} +) +if (NOT PASTIX_pastix_nompi.h_INCLUDE_DIRS) + message(STATUS "A version of Pastix has been found but pastix_nompi.h does not exist in the include directory." + " Because Eigen tests require a version without MPI, we disable the Pastix backend.") +endif() +if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS) add_definitions("-DEIGEN_PASTIX_SUPPORT") - include_directories(${PASTIX_INCLUDES}) + include_directories(${PASTIX_INCLUDE_DIRS_DEP}) if(SCOTCH_FOUND) - include_directories(${SCOTCH_INCLUDES}) + include_directories(${SCOTCH_INCLUDE_DIRS}) set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${SCOTCH_LIBRARIES}) elseif(METIS_FOUND) - include_directories(${METIS_INCLUDES}) + include_directories(${METIS_INCLUDE_DIRS}) set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${METIS_LIBRARIES}) else(SCOTCH_FOUND) ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ") endif(SCOTCH_FOUND) - set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) - set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES}) + set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP}) ei_add_property(EIGEN_TESTED_BACKENDS "PaStiX, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ") @@ -104,7 +111,7 @@ endif() if(METIS_FOUND) add_definitions("-DEIGEN_METIS_SUPPORT") - include_directories(${METIS_INCLUDES}) + include_directories(${METIS_INCLUDE_DIRS}) ei_add_property(EIGEN_TESTED_BACKENDS "METIS, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "METIS, ") -- cgit v1.2.3 From 498aa95a8b2b505c919efff8857a6fd2a5bbc172 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 11:53:49 +0200 Subject: bug #1424: add numext::abs specialization for unsigned integer types. 
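The MathFunctions.h hunk below splits numext::abs into two enable_if overloads: signed and complex scalars keep the std::abs path, while unsigned integer scalars are returned unchanged, since their absolute value is the value itself and routing them through std::abs is ambiguous or triggers integer promotions. A minimal standalone sketch of the same dispatch, using std::is_signed/std::is_unsigned in place of Eigen's NumTraits (the my_abs name and the whole snippet are illustrative, not part of the patch):

// Standalone illustration of the overload split -- not Eigen code.
#include <cassert>
#include <cmath>
#include <type_traits>

// Signed integer and floating-point scalars: defer to std::abs.
template <typename T>
typename std::enable_if<std::is_signed<T>::value, T>::type
my_abs(const T& x) { using std::abs; return abs(x); }

// Unsigned integer scalars: already non-negative, return as-is.
template <typename T>
typename std::enable_if<std::is_unsigned<T>::value, T>::type
my_abs(const T& x) { return x; }

int main() {
  assert(my_abs(-3) == 3);
  assert(my_abs(3u) == 3u);    // without the unsigned overload this call would not compile
  assert(my_abs(-2.5) == 2.5);
  return 0;
}

The actual overloads in the hunk key on NumTraits<T>::IsSigned || NumTraits<T>::IsComplex and return NumTraits<T>::Real, so complex scalars still yield their real-valued modulus; the new test/numext.cpp added further down exercises both branches.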
--- Eigen/src/Core/MathFunctions.h | 10 +++++++- test/CMakeLists.txt | 1 + test/main.h | 11 +++++++++ test/numext.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 test/numext.cpp (limited to 'test/CMakeLists.txt') diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 5ec6c395e..0be4a25da 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1232,11 +1232,19 @@ double log(const double &x) { return ::log(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename NumTraits::Real abs(const T &x) { +typename internal::enable_if::IsSigned || NumTraits::IsComplex,typename NumTraits::Real>::type +abs(const T &x) { EIGEN_USING_STD_MATH(abs); return abs(x); } +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +typename internal::enable_if::IsSigned || NumTraits::IsComplex),typename NumTraits::Real>::type +abs(const T &x) { + return x; +} + #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8da51ce57..e73ab92b4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -148,6 +148,7 @@ add_custom_target(BuildOfficial) ei_add_test(rand) ei_add_test(meta) +ei_add_test(numext) ei_add_test(sizeof) ei_add_test(dynalloc) ei_add_test(nomalloc) diff --git a/test/main.h b/test/main.h index 25d2dcf43..bd5325196 100644 --- a/test/main.h +++ b/test/main.h @@ -310,6 +310,17 @@ template<> inline float test_precision >() { return test_pre template<> inline double test_precision >() { return test_precision(); } template<> inline long double test_precision >() { return test_precision(); } +inline bool test_isApprox(const short& a, const short& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned short& a, const unsigned short& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned int& a, const unsigned int& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const long& a, const long& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned long& a, const unsigned long& b) +{ return internal::isApprox(a, b, test_precision()); } + inline bool test_isApprox(const int& a, const int& b) { return internal::isApprox(a, b, test_precision()); } inline bool test_isMuchSmallerThan(const int& a, const int& b) diff --git a/test/numext.cpp b/test/numext.cpp new file mode 100644 index 000000000..3de33e2f9 --- /dev/null +++ b/test/numext.cpp @@ -0,0 +1,53 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +template +void check_abs() { + typedef typename NumTraits::Real Real; + + if(NumTraits::IsSigned) + VERIFY_IS_EQUAL(numext::abs(-T(1)), T(1)); + VERIFY_IS_EQUAL(numext::abs(T(0)), T(0)); + VERIFY_IS_EQUAL(numext::abs(T(1)), T(1)); + + for(int k=0; k(); + if(!internal::is_same::value) + x = x/Real(2); + if(NumTraits::IsSigned) + { + VERIFY_IS_EQUAL(numext::abs(x), numext::abs(-x)); + VERIFY( numext::abs(-x) >= Real(0)); + } + VERIFY( numext::abs(x) >= Real(0)); + VERIFY_IS_APPROX( numext::abs2(x), numext::abs2(numext::abs(x)) ); + } +} + +void test_numext() { + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + + CALL_SUBTEST( check_abs >() ); + CALL_SUBTEST( check_abs >() ); +} -- cgit v1.2.3 From c0e1d510fd7086a7e79987b7df0a789a4457c5dc Mon Sep 17 00:00:00 2001 From: Kyle Vedder Date: Wed, 4 Oct 2017 21:01:23 -0500 Subject: Add support for SuiteSparse's KLU routines --- cmake/FindKLU.cmake | 51 ++++ test/CMakeLists.txt | 16 ++ test/klu_support.cpp | 32 +++ unsupported/Eigen/KLUSupport | 41 +++ unsupported/Eigen/src/KLUSupport/KLUSupport.h | 364 ++++++++++++++++++++++++++ 5 files changed, 504 insertions(+) create mode 100644 cmake/FindKLU.cmake create mode 100644 test/klu_support.cpp create mode 100644 unsupported/Eigen/KLUSupport create mode 100644 unsupported/Eigen/src/KLUSupport/KLUSupport.h (limited to 'test/CMakeLists.txt') diff --git a/cmake/FindKLU.cmake b/cmake/FindKLU.cmake new file mode 100644 index 000000000..2783b63d2 --- /dev/null +++ b/cmake/FindKLU.cmake @@ -0,0 +1,51 @@ +# KLU lib usually requires linking to a blas library. +# It is up to the user of this module to find a BLAS and link to it. 
+ +if (KLU_INCLUDES AND KLU_LIBRARIES) + set(KLU_FIND_QUIETLY TRUE) +endif (KLU_INCLUDES AND KLU_LIBRARIES) + +find_path(KLU_INCLUDES + NAMES + klu.h + PATHS + $ENV{KLUDIR} + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + suitesparse + ufsparse +) + +if(KLU_LIBRARIES) + + if(NOT KLU_LIBDIR) + get_filename_component(KLU_LIBDIR ${KLU_LIBRARIES} PATH) + endif(NOT KLU_LIBDIR) + + find_library(COLAMD_LIBRARY colamd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(COLAMD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${COLAMD_LIBRARY}) + endif () + + find_library(AMD_LIBRARY amd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(AMD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${AMD_LIBRARY}) + endif () + + find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(SUITESPARSE_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${SUITESPARSE_LIBRARY}) + endif () + + find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(CHOLMOD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${CHOLMOD_LIBRARY}) + endif() + +endif(KLU_LIBRARIES) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(KLU DEFAULT_MSG + KLU_INCLUDES KLU_LIBRARIES) + +mark_as_advanced(KLU_INCLUDES KLU_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY CHOLMOD_LIBRARY SUITESPARSE_LIBRARY) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e73ab92b4..8bd086ce3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,6 +68,17 @@ else() ei_add_property(EIGEN_MISSING_BACKENDS "UmfPack, ") endif() +find_package(KLU) +if(KLU_FOUND) + add_definitions("-DEIGEN_KLU_SUPPORT") + include_directories(${KLU_INCLUDES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ") +else() + ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ") +endif() + find_package(SuperLU 4.0) if(SUPERLU_FOUND) add_definitions("-DEIGEN_SUPERLU_SUPPORT") @@ -297,6 +308,11 @@ if(UMFPACK_FOUND) ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") endif() +if(KLU_FOUND OR SuiteSparse_FOUND) + message("ADDING KLU TEST") + ei_add_test(klu_support "" "${KLU_ALL_LIBS}") +endif() + if(SUPERLU_FOUND) ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") endif() diff --git a/test/klu_support.cpp b/test/klu_support.cpp new file mode 100644 index 000000000..8b1fdeb41 --- /dev/null +++ b/test/klu_support.cpp @@ -0,0 +1,32 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
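The new test below drives the KLU solver through check_sparse_square_solving for both column-major and row-major sparse matrices. For orientation, here is a minimal, hedged usage sketch of the solver class this patch introduces, mirroring Eigen's usual sparse-solver workflow of compute() followed by solve(); the include path, the 3x3 system, and all variable names are illustrative assumptions (the module header is added under unsupported/ in this patch, so its exact location may differ in other Eigen versions):

// Illustrative usage sketch only -- not part of the patch. Assumes the KLU/BTF
// headers and libraries are installed and that the module header added by this
// patch is reachable at the path below.
#include <unsupported/Eigen/KLUSupport>   // assumed include path for this patch
#include <Eigen/SparseCore>
#include <Eigen/Dense>
#include <iostream>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;

  // Small square, invertible test system A x = b (values are arbitrary).
  SpMat A(3, 3);
  A.insert(0, 0) = 4.0;  A.insert(0, 1) = 1.0;
  A.insert(1, 0) = 1.0;  A.insert(1, 1) = 3.0;  A.insert(1, 2) = 1.0;
  A.insert(2, 1) = 1.0;  A.insert(2, 2) = 2.0;
  A.makeCompressed();   // compressed column-major input avoids an internal copy
  Eigen::Vector3d b(1.0, 2.0, 3.0);

  Eigen::KLU<SpMat> solver;
  solver.compute(A);    // analyzePattern() + factorize(); they can also be called separately
  if (solver.info() != Eigen::Success) {
    std::cerr << "KLU factorization failed" << std::endl;
    return 1;
  }
  Eigen::Vector3d x = solver.solve(b);  // dense or sparse right-hand sides are supported
  std::cout << "x = " << x.transpose() << std::endl;
  return 0;
}

As with the existing UmfPackLU wrapper, the factorization can be reused for several right-hand sides, and analyzePattern()/factorize() can be invoked separately when successive matrices share the same sparsity pattern.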
+ +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS +#include "sparse_solver.h" + +#include + +template void test_klu_support_T() +{ + KLU > klu_colmajor; + KLU > klu_rowmajor; + + check_sparse_square_solving(klu_colmajor); + check_sparse_square_solving(klu_rowmajor); + + //check_sparse_square_determinant(umfpack_colmajor); + //check_sparse_square_determinant(umfpack_rowmajor); +} + +void test_klu_support() +{ + CALL_SUBTEST_1(test_klu_support_T()); + CALL_SUBTEST_2(test_klu_support_T >()); +} + diff --git a/unsupported/Eigen/KLUSupport b/unsupported/Eigen/KLUSupport new file mode 100644 index 000000000..b23d90535 --- /dev/null +++ b/unsupported/Eigen/KLUSupport @@ -0,0 +1,41 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_MODULE_H +#define EIGEN_KLUSUPPORT_MODULE_H + +#include + +#include + +extern "C" { +#include +#include + } + +/** \ingroup Support_modules + * \defgroup KLUSupport_Module KLUSupport module + * + * This module provides an interface to the KLU library which is part of the suitesparse package. + * It provides the following factorization class: + * - class KLU: a sparse LU factorization, well-suited for circuit simulation. + * + * \code + * #include + * \endcode + * + * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies. + * The dependencies depend on how umfpack has been compiled. + * For a cmake based project, you can use our FindKLU.cmake module to help you in this task. + * + */ + +#include "src/KLUSupport/KLUSupport.h" + +#include + +#endif // EIGEN_KLUSUPPORT_MODULE_H diff --git a/unsupported/Eigen/src/KLUSupport/KLUSupport.h b/unsupported/Eigen/src/KLUSupport/KLUSupport.h new file mode 100644 index 000000000..d2781202e --- /dev/null +++ b/unsupported/Eigen/src/KLUSupport/KLUSupport.h @@ -0,0 +1,364 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Kyle Macfarlan +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_H +#define EIGEN_KLUSUPPORT_H + +namespace Eigen { + +/* TODO extract L, extract U, compute det, etc... */ + +/** \ingroup KLUSupport_Module + * \brief A sparse LU factorization and solver based on KLU + * + * This class allows to solve for A.X = B sparse linear problems via a LU factorization + * using the KLU library. The sparse matrix A must be squared and full rank. + * The vectors or matrices X and B can be either dense or sparse. + * + * \warning The input matrix A should be in a \b compressed and \b column-major form. + * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix. 
+ * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * + * \implsparsesolverconcept + * + * \sa \ref TutorialSparseSolverConcept, class SparseLU + */ + + +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, double B [ ], klu_common *Common, double) { + return klu_solve(Symbolic, Numeric, ldim, nrhs, B, Common); +} + +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_solve(Symbolic, Numeric, ldim, nrhs, &numext::real_ref(B[0]), Common); +} + +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, double B[], klu_common *Common, double) { + return klu_tsolve(Symbolic, Numeric, ldim, nrhs, B, Common); +} + +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_tsolve(Symbolic, Numeric, ldim, nrhs, &numext::real_ref(B[0]), 0, Common); +} + +inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) { + return klu_factor(Ap, Ai, Ax, Symbolic, Common); +} + +inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex) { + return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common); +} + + +template +class KLU : public SparseSolverBase > +{ + protected: + typedef SparseSolverBase > Base; + using Base::m_isInitialized; + public: + using Base::_solve_impl; + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename MatrixType::StorageIndex StorageIndex; + typedef Matrix Vector; + typedef Matrix IntRowVectorType; + typedef Matrix IntColVectorType; + typedef SparseMatrix LUMatrixType; + typedef SparseMatrix KLUMatrixType; + typedef Ref KLUMatrixRef; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + + public: + + KLU() + : m_dummy(0,0), mp_matrix(m_dummy) + { + init(); + } + + template + explicit KLU(const InputMatrixType& matrix) + : mp_matrix(matrix) + { + init(); + compute(matrix); + } + + ~KLU() + { + if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common); + if(m_numeric) klu_free_numeric(&m_numeric,&m_common); + } + + inline Index rows() const { return mp_matrix.rows(); } + inline Index cols() const { return mp_matrix.cols(); } + + /** \brief Reports whether previous computation was successful. + * + * \returns \c Success if computation was succesful, + * \c NumericalIssue if the matrix.appears to be negative. + */ + ComputationInfo info() const + { + eigen_assert(m_isInitialized && "Decomposition is not initialized."); + return m_info; + } + + inline const LUMatrixType& matrixL() const + { + if (m_extractedDataAreDirty) extractData(); + return m_l; + } + + inline const LUMatrixType& matrixU() const + { + if (m_extractedDataAreDirty) extractData(); + return m_u; + } + + inline const IntColVectorType& permutationP() const + { + if (m_extractedDataAreDirty) extractData(); + return m_p; + } + + inline const IntRowVectorType& permutationQ() const + { + if (m_extractedDataAreDirty) extractData(); + return m_q; + } + + /** Computes the sparse Cholesky decomposition of \a matrix + * Note that the matrix should be column-major, and in compressed format for best performance. 
+ * \sa SparseMatrix::makeCompressed(). + */ + template + void compute(const InputMatrixType& matrix) + { + if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); + if(m_numeric) klu_free_numeric(&m_numeric, &m_common); + grab(matrix.derived()); + analyzePattern_impl(); + factorize_impl(); + } + + /** Performs a symbolic decomposition on the sparcity of \a matrix. + * + * This function is particularly useful when solving for several problems having the same structure. + * + * \sa factorize(), compute() + */ + template + void analyzePattern(const InputMatrixType& matrix) + { + if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); + if(m_numeric) klu_free_numeric(&m_numeric, &m_common); + + grab(matrix.derived()); + + analyzePattern_impl(); + } + + + /** Provides access to the control settings array used by KLU. + * + * See KLU documentation for details. + */ + inline const klu_common& kluCommon() const + { + return m_common; + } + + /** Provides access to the control settings array used by UmfPack. + * + * If this array contains NaN's, the default values are used. + * + * See KLU documentation for details. + */ + inline klu_common& kluCommon() + { + return m_common; + } + + /** Performs a numeric decomposition of \a matrix + * + * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed. + * + * \sa analyzePattern(), compute() + */ + template + void factorize(const InputMatrixType& matrix) + { + eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()"); + if(m_numeric) + klu_free_numeric(&m_numeric,&m_common); + + grab(matrix.derived()); + + factorize_impl(); + } + + /** \internal */ + template + bool _solve_impl(const MatrixBase &b, MatrixBase &x) const; + + Scalar determinant() const; + + void extractData() const; + + protected: + + void init() + { + m_info = InvalidInput; + m_isInitialized = false; + m_numeric = 0; + m_symbolic = 0; + m_extractedDataAreDirty = true; + + klu_defaults(&m_common); + } + + void analyzePattern_impl() + { + m_info = InvalidInput; + m_analysisIsOk = false; + m_factorizationIsOk = false; + m_symbolic = klu_analyze(internal::convert_index(mp_matrix.rows()), + const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), + &m_common); + if (m_symbolic) { + m_isInitialized = true; + m_info = Success; + m_analysisIsOk = true; + m_extractedDataAreDirty = true; + } + } + + void factorize_impl() + { + + m_numeric = klu_factor(const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), const_cast(mp_matrix.valuePtr()), + m_symbolic, &m_common, Scalar()); + + + m_info = m_numeric ? Success : NumericalIssue; + m_factorizationIsOk = m_numeric ? 1 : 0; + m_extractedDataAreDirty = true; + } + + template + void grab(const EigenBase &A) + { + mp_matrix.~KLUMatrixRef(); + ::new (&mp_matrix) KLUMatrixRef(A.derived()); + } + + void grab(const KLUMatrixRef &A) + { + if(&(A.derived()) != &mp_matrix) + { + mp_matrix.~KLUMatrixRef(); + ::new (&mp_matrix) KLUMatrixRef(A); + } + } + + // cached data to reduce reallocation, etc. 
+ mutable LUMatrixType m_l; + + mutable LUMatrixType m_u; + mutable IntColVectorType m_p; + mutable IntRowVectorType m_q; + + KLUMatrixType m_dummy; + KLUMatrixRef mp_matrix; + + klu_numeric* m_numeric; + klu_symbolic* m_symbolic; + klu_common m_common; + mutable ComputationInfo m_info; + int m_factorizationIsOk; + int m_analysisIsOk; + mutable bool m_extractedDataAreDirty; + + private: + KLU(const KLU& ) { } +}; + + +template +void KLU::extractData() const +{ + if (m_extractedDataAreDirty) + { + eigen_assert(false && "KLU: extractData Not Yet Implemented"); + +// // get size of the data +// int lnz, unz, rows, cols, nz_udiag; +// umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); +// +// // allocate data +// m_l.resize(rows,(std::min)(rows,cols)); +// m_l.resizeNonZeros(lnz); +// +// m_u.resize((std::min)(rows,cols),cols); +// m_u.resizeNonZeros(unz); +// +// m_p.resize(rows); +// m_q.resize(cols); +// +// // extract +// umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(), +// m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(), +// m_p.data(), m_q.data(), 0, 0, 0, m_numeric); +// +// m_extractedDataAreDirty = false; + } +} + +template +typename KLU::Scalar KLU::determinant() const +{ + eigen_assert(false && "KLU: extractData Not Yet Implemented"); + return Scalar(); +} + +template +template +bool KLU::_solve_impl(const MatrixBase &b, MatrixBase &x) const +{ + Index rhsCols = b.cols(); + eigen_assert((BDerived::Flags&RowMajorBit)==0 && "KLU backend does not support non col-major rhs yet"); + eigen_assert((XDerived::Flags&RowMajorBit)==0 && "KLU backend does not support non col-major result yet"); + eigen_assert(b.derived().data() != x.derived().data() && " KLU does not support inplace solve"); + eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()"); + + x = b; + int info = 0; + if (true/*(MatrixType::Flags&RowMajorBit) == 0*/) + { + info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); + } + else + { + info = klu_tsolve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); + } + + m_info = info!=0 ? Success : NumericalIssue; + return true; +} + +} // end namespace Eigen + +#endif // EIGEN_KLUSUPPORT_H -- cgit v1.2.3 From e3912f5e63b0c08a0f592ad425e926a5d61e1b8a Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sun, 11 Mar 2018 10:01:44 -0400 Subject: MIsc. 
source and comment typos Found using `codespell` and `grep` from downstream FreeCAD --- Eigen/Core | 2 +- Eigen/src/Cholesky/LDLT.h | 2 +- Eigen/src/Cholesky/LLT.h | 2 +- Eigen/src/Core/AssignEvaluator.h | 4 ++-- Eigen/src/Core/DenseBase.h | 2 +- Eigen/src/Core/DenseStorage.h | 2 +- Eigen/src/Core/MathFunctions.h | 2 +- Eigen/src/Core/NoAlias.h | 4 ++-- Eigen/src/Core/PlainObjectBase.h | 2 +- Eigen/src/Core/Product.h | 2 +- Eigen/src/Core/Transpositions.h | 2 +- Eigen/src/Core/TriangularMatrix.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 ++-- Eigen/src/Core/arch/SSE/MathFunctions.h | 2 +- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++-- Eigen/src/Core/products/GeneralMatrixVector.h | 2 +- Eigen/src/Core/products/Parallelizer.h | 2 +- Eigen/src/Core/products/SelfadjointMatrixVector.h | 2 +- Eigen/src/Core/util/Macros.h | 4 ++-- Eigen/src/Core/util/Memory.h | 2 +- Eigen/src/Core/util/Meta.h | 2 +- Eigen/src/Eigenvalues/ComplexEigenSolver.h | 2 +- Eigen/src/Eigenvalues/ComplexSchur.h | 2 +- Eigen/src/Eigenvalues/EigenSolver.h | 2 +- Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h | 2 +- Eigen/src/Eigenvalues/RealQZ.h | 2 +- Eigen/src/Eigenvalues/RealSchur.h | 2 +- Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 2 +- Eigen/src/Geometry/Scaling.h | 2 +- Eigen/src/IterativeLinearSolvers/IncompleteLUT.h | 4 ++-- Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h | 2 +- Eigen/src/KLUSupport/KLUSupport.h | 2 +- Eigen/src/LU/FullPivLU.h | 2 +- Eigen/src/LU/PartialPivLU.h | 4 ++-- Eigen/src/OrderingMethods/Eigen_Colamd.h | 6 +++--- Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 +- Eigen/src/PardisoSupport/PardisoSupport.h | 2 +- Eigen/src/QR/ColPivHouseholderQR.h | 2 +- Eigen/src/QR/CompleteOrthogonalDecomposition.h | 2 +- Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 2 +- Eigen/src/SVD/BDCSVD.h | 2 +- Eigen/src/SVD/UpperBidiagonalization.h | 4 ++-- Eigen/src/SparseCholesky/SimplicialCholesky.h | 2 +- Eigen/src/SparseCore/SparseMatrix.h | 6 +++--- Eigen/src/SparseCore/SparseProduct.h | 2 +- Eigen/src/SparseCore/SparseVector.h | 2 +- Eigen/src/SparseLU/SparseLU.h | 2 +- Eigen/src/SparseLU/SparseLU_Memory.h | 2 +- Eigen/src/SparseLU/SparseLU_column_dfs.h | 4 ++-- Eigen/src/SparseLU/SparseLU_gemm_kernel.h | 2 +- Eigen/src/SparseLU/SparseLU_panel_bmod.h | 2 +- Eigen/src/SuperLUSupport/SuperLUSupport.h | 2 +- Eigen/src/UmfPackSupport/UmfPackSupport.h | 2 +- Eigen/src/plugins/IndexedViewMethods.h | 2 +- bench/analyze-blocking-sizes.cpp | 2 +- bench/btl/README | 2 +- bench/btl/generic_bench/bench.hh | 2 +- bench/btl/generic_bench/utils/size_log.hh | 2 +- bench/btl/generic_bench/utils/xy_file.hh | 2 +- bench/btl/libs/ublas/ublas_interface.hh | 2 +- bench/eig33.cpp | 2 +- bench/spbench/spbenchsolver.cpp | 2 +- blas/f2c/ctbmv.c | 2 +- blas/f2c/dtbmv.c | 2 +- blas/f2c/stbmv.c | 2 +- blas/f2c/ztbmv.c | 2 +- blas/level1_impl.h | 2 +- blas/testing/cblat1.f | 2 +- blas/testing/dblat1.f | 2 +- blas/testing/sblat1.f | 2 +- blas/testing/zblat1.f | 2 +- cmake/EigenConfigureTesting.cmake | 4 ++-- cmake/EigenTesting.cmake | 10 +++++----- cmake/FindComputeCpp.cmake | 2 +- cmake/FindEigen3.cmake | 2 +- debug/msvc/eigen_autoexp_part.dat | 2 +- doc/Doxyfile.in | 2 +- doc/FunctionsTakingEigenTypes.dox | 4 ++-- doc/PreprocessorDirectives.dox | 2 +- doc/QuickStartGuide.dox | 2 +- doc/SparseQuickReference.dox | 2 +- doc/TemplateKeyword.dox | 2 +- doc/TopicLazyEvaluation.dox | 2 +- doc/TopicLinearAlgebraDecompositions.dox | 2 +- doc/TopicMultithreading.dox | 2 +- doc/TutorialMapClass.dox | 4 ++-- 
doc/TutorialSparse.dox | 4 ++-- doc/UnalignedArrayAssert.dox | 4 ++-- doc/UsingNVCC.dox | 2 +- doc/eigendoxy.css | 2 +- doc/special_examples/Tutorial_sparse_example.cpp | 2 +- lapack/CMakeLists.txt | 4 ++-- test/CMakeLists.txt | 2 +- test/bdcsvd.cpp | 2 +- test/eigensolver_complex.cpp | 2 +- test/geo_quaternion.cpp | 2 +- test/main.h | 2 +- test/packetmath.cpp | 2 +- 98 files changed, 122 insertions(+), 122 deletions(-) (limited to 'test/CMakeLists.txt') diff --git a/Eigen/Core b/Eigen/Core index 5a6dec8cc..a9bbfe276 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -360,7 +360,7 @@ inline static const char *SimdInstructionSetsInUse(void) { namespace Eigen { -// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to +// we use size_t frequently and we'll never remember to prepend it with std:: every time just to // ensure QNX/QCC support using std::size_t; // gcc 4.6.0 wants std:: for ptrdiff_t diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 13a8f6d14..5be58377b 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -247,7 +247,7 @@ template class LDLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 814174d47..22e4be75d 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -180,7 +180,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears not to be positive definite. */ ComputationInfo info() const diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index ebf5590de..362d905d2 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType // AssignmentKind must define a Kind typedef. template struct AssignmentKind; -// Assignement kind defined in this file: +// Assignment kind defined in this file: struct Dense2Dense {}; struct EigenBase2EigenBase {}; @@ -899,7 +899,7 @@ struct Assignment src.evalTo(dst); } - // NOTE The following two functions are templated to avoid their instanciation if not needed + // NOTE The following two functions are templated to avoid their instantiation if not needed // This is needed because some expressions supports evalTo only and/or have 'void' as scalar type. template EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index fd933eed4..53b427b17 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -395,7 +395,7 @@ template class DenseBase * Notice that in the case of a plain matrix or vector (not an expression) this function just returns * a const reference, in order to avoid a useless copy. * - * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. + * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. 
*/ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvalReturnType eval() const diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 7958feeb9..9e58fbf88 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -61,7 +61,7 @@ struct plain_array #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) #elif EIGEN_GNUC_AT_LEAST(4,7) - // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned. + // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned. // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: template diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 1b864a405..e981129b2 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -749,7 +749,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); } -// Implementatin of is* functions +// Implementation of is* functions // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. #if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index e94c8ee96..570283d90 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -75,10 +75,10 @@ class NoAlias * * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag. * Currently, even though several expressions may alias, only product - * expressions have this flag. Therefore, noalias() is only usefull when + * expressions have this flag. Therefore, noalias() is only useful when * the source expression contains a matrix product. 
* - * Here are some examples where noalias is usefull: + * Here are some examples where noalias is useful: * \code * D.noalias() = A * B; * D.noalias() += A.transpose() * B; diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 1dc7e223a..6c0a42ec7 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -780,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type resize(size); } - // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted) + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 676c48027..3d67d9489 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -116,7 +116,7 @@ class dense_product_base : public internal::dense_xpr_base >::type {}; -/** Convertion to scalar for inner-products */ +/** Conversion to scalar for inner-products */ template class dense_product_base : public internal::dense_xpr_base >::type diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 19c17bb4a..8798deca5 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -84,7 +84,7 @@ class TranspositionsBase } // FIXME: do we want such methods ? - // might be usefull when the target matrix expression is complex, e.g.: + // might be useful when the target matrix expression is complex, e.g.: // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..); /* template diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index ed80da36a..ab73fcf21 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -470,7 +470,7 @@ template class TriangularViewImpl<_Mat * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if * \a Side==OnTheRight. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this @@ -496,7 +496,7 @@ template class TriangularViewImpl<_Mat * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. * This function will const_cast it, so constness isn't honored here. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * See TriangularView:solve() for the details. 
*/ diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index b3f1ea199..31bb896ca 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -434,7 +434,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data } #else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +// We also need to redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD @@ -500,7 +500,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& f vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } #else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +// We also need to redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 7b5f948e1..4af2c6cae 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -242,7 +242,7 @@ Packet2d pexp(const Packet2d& _x) return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); } -/* evaluation of 4 sines at onces, using SSE2 intrinsics. +/* evaluation of 4 sines at once, using SSE2 intrinsics. The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 45230bce5..9072d0ff3 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1523,7 +1523,7 @@ void gebp_kernel::half SResPacketHalf; @@ -1924,7 +1924,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 41d8242e1..b2a71bc6f 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -201,7 +201,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(nbThreads(), pb_max_threads); - // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session, + // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session, // then abort multi-threading // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 3fd180e6c..67390f1d7 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { /* Optimized selfadjoint matrix * vector product: - * This algorithm processes 2 columns at onces that allows to both reduce + * This algorithm processes 2 columns at once that allows to both reduce * the number of load/stores of the result by a factor 2 and to reduce * the instruction dependency. 
*/ diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e351b7ad9..5872ade26 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -719,7 +719,7 @@ namespace Eigen { #error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. #endif -// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated +// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated // They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 #if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES @@ -778,7 +778,7 @@ namespace Eigen { #endif // At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not. -// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES) +// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES) // and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). // Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index c455f92a1..006b0bfba 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -703,7 +703,7 @@ template void swap(scoped_array &a,scoped_array &b) * - 32 bytes alignment if AVX is enabled. * - 64 bytes alignment if AVX512 is enabled. * -* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented * \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 0fa818008..998b8921a 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -272,7 +272,7 @@ template<> struct numeric_limits #endif /** \internal - * A base class do disable default copy ctor and copy assignement operator. + * A base class do disable default copy ctor and copy assignment operator. */ class noncopyable { diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h index dc5fae06a..081e918f1 100644 --- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -214,7 +214,7 @@ template class ComplexEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index 7f38919f7..b8b3490c6 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h @@ -212,7 +212,7 @@ template class ComplexSchur /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. 
*/ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index f205b185d..997bebe7b 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -277,7 +277,7 @@ template class EigenSolver template EigenSolver& compute(const EigenBase& matrix, bool computeEigenvectors = true); - /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); diff --git a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h index 5f6bb8289..d0f9091be 100644 --- a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h @@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT * * \returns Reference to \c *this * - * Accoring to \p options, this function computes eigenvalues and (if requested) + * According to \p options, this function computes eigenvalues and (if requested) * the eigenvectors of one of the following three generalized eigenproblems: * - \c Ax_lBx: \f$ Ax = \lambda B x \f$ * - \c ABx_lx: \f$ ABx = \lambda x \f$ diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index b3a910dd9..e2b37f40e 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -161,7 +161,7 @@ namespace Eigen { /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index f5c86041d..9e71f3040 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -190,7 +190,7 @@ template class RealSchur RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU); /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 9ddd553f2..040f8d3bb 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -337,7 +337,7 @@ template class SelfAdjointEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. 
*/ EIGEN_DEVICE_FUNC ComputationInfo info() const diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index 8d9acf252..df650fda6 100755 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -128,7 +128,7 @@ public: /** Concatenates a linear transformation matrix and a uniform scaling * \relates UniformScaling */ -// NOTE this operator is defiend in MatrixBase and not as a friend function +// NOTE this operator is defined in MatrixBase and not as a friend function // of UniformScaling to fix an internal crash of Intel's ICC template EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 338e6f10a..43bd8e8f6 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -136,7 +136,7 @@ class IncompleteLUT : public SparseSolverBase::analyzePattern(const _MatrixType& amat) SparseMatrix mat1 = amat; SparseMatrix mat2 = amat.transpose(); // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice. - // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered... + // on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred... SparseMatrix AtA = mat2 + mat1; AMDOrdering ordering; ordering(AtA,m_P); diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 7c2326eb7..bfeee71cd 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -275,7 +275,7 @@ public: const Preconditioner& preconditioner() const { return m_preconditioner; } /** \returns the max number of iterations. - * It is either the value setted by setMaxIterations or, by default, + * It is either the value set by setMaxIterations or, by default, * twice the number of columns of the matrix. */ Index maxIterations() const diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h index a9e8633d9..d2633a935 100644 --- a/Eigen/src/KLUSupport/KLUSupport.h +++ b/Eigen/src/KLUSupport/KLUSupport.h @@ -106,7 +106,7 @@ class KLU : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index ec61086d5..50d1bb41b 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -48,7 +48,7 @@ template struct traits > * The data of the LU decomposition can be directly accessed through the methods matrixLU(), * permutationP(), permutationQ(). * - * As an exemple, here is how the original matrix can be retrieved: + * As an example, here is how the original matrix can be retrieved: * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index d43961887..bfcd2c95b 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -420,8 +420,8 @@ struct partial_lu_impl * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * * \note This very low level interface using pointers, etc. 
is to: - * 1 - reduce the number of instanciations to the strict minimum - * 2 - avoid infinite recursion of the instanciations with Block > > + * 1 - reduce the number of instantiations to the strict minimum + * 2 - avoid infinite recursion of the instantiations with Block > > */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index da85b4d6e..67fcad3f7 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -1493,7 +1493,7 @@ static inline void order_children c = Col [c].shared1.parent ; /* continue until we hit an ordered column. There are */ - /* guarranteed not to be anymore unordered columns */ + /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ } while (Col [c].shared2.order == COLAMD_EMPTY) ; @@ -1638,7 +1638,7 @@ static void detect_super_cols COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ; COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ; /* row indices will same order for both supercols, */ - /* no gather scatter nessasary */ + /* no gather scatter necessary */ if (*cp1++ != *cp2++) { break ; @@ -1688,7 +1688,7 @@ static void detect_super_cols /* Defragments and compacts columns and rows in the workspace A. Used when - all avaliable memory has been used while performing row merging. Returns + all available memory has been used while performing row merging. Returns the index of the first free position in A, after garbage collection. The time taken by this routine is linear is the size of the array A, which is itself linear in the number of nonzeros in the input matrix. diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 160d8a523..37426877a 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the PaStiX reports a problem * \c InvalidInput if the input matrix is invalid * diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 091c3970e..fb2ba04b4 100644 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -140,7 +140,7 @@ class PardisoImpl : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix appears to be negative. */ ComputationInfo info() const diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 5270eaca2..ed47b05e3 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -402,7 +402,7 @@ template class ColPivHouseholderQR */ RealScalar maxPivot() const { return m_maxpivot; } - /** \brief Reports whether the QR factorization was succesful. + /** \brief Reports whether the QR factorization was successful. * * \note This function always returns \c Success. It is provided for compatibility * with other factorization routines. 
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 13b61fcdb..880becb25 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -353,7 +353,7 @@ class CompleteOrthogonalDecomposition { inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); } /** \brief Reports whether the complete orthogonal decomposition was - * succesful. + * successful. * * \note This function always returns \c Success. It is provided for * compatibility diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 953d57c9d..1a5c5254e 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -220,7 +220,7 @@ class SPQR : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the sparse QR can not be computed */ ComputationInfo info() const diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 06865a331..a24deb96a 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -62,7 +62,7 @@ struct traits > * recommended and can several order of magnitude faster. * * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations. - * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless + * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will * significantly degrade the accuracy. * diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 11ac847e1..0526ac931 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -202,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType y_k( Y.col(k).tail(remainingCols) ); - // let's use the begining of column k of Y as a temporary vector + // let's use the beginning of column k of Y as a temporary vector SubColumnType tmp( Y.col(k).head(k) ); y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck tmp.noalias() = V_k1.adjoint() * v_k; @@ -231,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType x_k ( X.col(k).tail(remainingRows-1) ); - // let's use the begining of column k of X as a temporary vectors + // let's use the beginning of column k of X as a temporary vectors // note that tmp0 and tmp1 overlaps SubColumnType tmp0 ( X.col(k).head(k) ), tmp1 ( X.col(k).head(k+1) ); diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 2907f6529..b9ca94bc3 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -101,7 +101,7 @@ class SimplicialCholeskyBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. 
*/ ComputationInfo info() const diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 323c2323b..8f77194b6 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -21,7 +21,7 @@ namespace Eigen { * This class implements a more versatile variants of the common \em compressed row/column storage format. * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index. * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra - * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero + * space in between the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero * can be done with limited memory reallocation and copies. * * A call to the function makeCompressed() turns the matrix into the standard \em compressed format @@ -503,7 +503,7 @@ class SparseMatrix } } - /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */ + /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */ void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { prune(default_prunning_func(reference,epsilon)); @@ -986,7 +986,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa * * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather - * be explicitely stored into a std::vector for instance. + * be explicitly stored into a std::vector for instance. */ template template diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h index 4cbf68781..c495a7398 100644 --- a/Eigen/src/SparseCore/SparseProduct.h +++ b/Eigen/src/SparseCore/SparseProduct.h @@ -17,7 +17,7 @@ namespace Eigen { * The automatic pruning of the small values can be achieved by calling the pruned() function * in which case a totally different product algorithm is employed: * \code - * C = (A*B).pruned(); // supress numerical zeros (exact) + * C = (A*B).pruned(); // suppress numerical zeros (exact) * C = (A*B).pruned(ref); * C = (A*B).pruned(ref,epsilon); * \endcode diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 19b0fbc9d..05779be68 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -281,7 +281,7 @@ class SparseVector } /** Swaps the values of \c *this and \a other. - * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only. + * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only. * \sa SparseMatrixBase::swap() */ inline void swap(SparseVector& other) diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index f883ab383..383a203b4 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -193,7 +193,7 @@ class SparseLU : public SparseSolverBase >, /** \brief Reports whether previous computation was successful. 
* - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance * \c InvalidInput if the input matrix is invalid * diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h index 4dc42e87b..349bfd585 100644 --- a/Eigen/src/SparseLU/SparseLU_Memory.h +++ b/Eigen/src/SparseLU/SparseLU_Memory.h @@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w) /** - * Expand the existing storage to accomodate more fill-ins + * Expand the existing storage to accommodate more fill-ins * \param vec Valid pointer to the vector to allocate or expand * \param[in,out] length At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector * \param[in] nbElts Current number of elements in the factors diff --git a/Eigen/src/SparseLU/SparseLU_column_dfs.h b/Eigen/src/SparseLU/SparseLU_column_dfs.h index c98b30e32..5a2c941b4 100644 --- a/Eigen/src/SparseLU/SparseLU_column_dfs.h +++ b/Eigen/src/SparseLU/SparseLU_column_dfs.h @@ -151,7 +151,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j StorageIndex ito = glu.xlsub(fsupc+1); glu.xlsub(jcolm1) = ito; StorageIndex istop = ito + jptr - jm1ptr; - xprune(jcolm1) = istop; // intialize xprune(jcol-1) + xprune(jcolm1) = istop; // initialize xprune(jcol-1) glu.xlsub(jcol) = istop; for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) @@ -166,7 +166,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j // Tidy up the pointers before exit glu.xsup(nsuper+1) = jcolp1; glu.supno(jcolp1) = nsuper; - xprune(jcol) = StorageIndex(nextl); // Intialize upper bound for pruning + xprune(jcol) = StorageIndex(nextl); // Initialize upper bound for pruning glu.xlsub(jcolp1) = StorageIndex(nextl); return 0; diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h index 95ba7413f..e37c2fe0d 100644 --- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ pstore(C0+i+(I)*PacketSize, c0); - // agressive vectorization and peeling + // aggressive vectorization and peeling for(Index i=0; i /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index 9568cc1d5..c636f17ac 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -201,7 +201,7 @@ class UmfPackLU : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. 
*/ ComputationInfo info() const diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index a7ec63adf..9ad2d9aee 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -112,7 +112,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND #if EIGEN_HAS_STATIC_ARRAY_TEMPLATE -// The folowing three overloads are needed to handle raw Index[N] arrays. +// The following three overloads are needed to handle raw Index[N] arrays. template IndexedView::type> diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index d563a1d2d..6bc4aca3d 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -825,7 +825,7 @@ int main(int argc, char* argv[]) } for (int i = 1; i < argc; i++) { bool arg_handled = false; - // Step 1. Try to match action invokation names. + // Step 1. Try to match action invocation names. for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { if (!strcmp(argv[i], (*it)->invokation_name())) { if (!action) { diff --git a/bench/btl/README b/bench/btl/README index f3f5fb36f..ebed88960 100644 --- a/bench/btl/README +++ b/bench/btl/README @@ -36,7 +36,7 @@ For instance: You can also select a given set of actions defining the environment variable BTL_CONFIG this way: BTL_CONFIG="-a action1{:action2}*" ctest -V -An exemple: +An example: BTL_CONFIG="-a axpy:vector_matrix:trisolve:ata" ctest -V -R eigen2 Finally, if bench results already exist (the bench*.dat files) then they merges by keeping the best for each matrix size. If you want to overwrite the previous ones you can simply add the "--overwrite" option: diff --git a/bench/btl/generic_bench/bench.hh b/bench/btl/generic_bench/bench.hh index 7b7b951b5..0732940d5 100644 --- a/bench/btl/generic_bench/bench.hh +++ b/bench/btl/generic_bench/bench.hh @@ -159,7 +159,7 @@ BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point ){ // bench(size_min,size_max,nb_point); - // Only for small problem size. Otherwize it will be too long + // Only for small problem size. Otherwise it will be too long // bench(size_min,size_max,nb_point); // bench(size_min,size_max,nb_point); diff --git a/bench/btl/generic_bench/utils/size_log.hh b/bench/btl/generic_bench/utils/size_log.hh index 13a3da7a8..68945e7cc 100644 --- a/bench/btl/generic_bench/utils/size_log.hh +++ b/bench/btl/generic_bench/utils/size_log.hh @@ -23,7 +23,7 @@ #include "math.h" // The Vector class must satisfy the following part of STL vector concept : // resize() method -// [] operator for seting element +// [] operator for setting element // the vector element are int compatible. 
template void size_log(const int nb_point, const int size_min, const int size_max, Vector & X) diff --git a/bench/btl/generic_bench/utils/xy_file.hh b/bench/btl/generic_bench/utils/xy_file.hh index 4571bed8f..0492faf09 100644 --- a/bench/btl/generic_bench/utils/xy_file.hh +++ b/bench/btl/generic_bench/utils/xy_file.hh @@ -55,7 +55,7 @@ bool read_xy_file(const std::string & filename, std::vector & tab_sizes, // The Vector class must satisfy the following part of STL vector concept : // resize() method -// [] operator for seting element +// [] operator for setting element // the vector element must have the << operator define using namespace std; diff --git a/bench/btl/libs/ublas/ublas_interface.hh b/bench/btl/libs/ublas/ublas_interface.hh index 95cad5195..f59b7cf2f 100644 --- a/bench/btl/libs/ublas/ublas_interface.hh +++ b/bench/btl/libs/ublas/ublas_interface.hh @@ -100,7 +100,7 @@ public : Y+=coef*X; } - // alias free assignements + // alias free assignments static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ X.assign(prod(A,B)); diff --git a/bench/eig33.cpp b/bench/eig33.cpp index 47947a9be..f003d8a53 100644 --- a/bench/eig33.cpp +++ b/bench/eig33.cpp @@ -101,7 +101,7 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals) computeRoots(scaledMat,evals); // compute the eigen vectors - // **here we assume 3 differents eigenvalues** + // **here we assume 3 different eigenvalues** // "optimized version" which appears to be slower with gcc! // Vector base; diff --git a/bench/spbench/spbenchsolver.cpp b/bench/spbench/spbenchsolver.cpp index 4acd0039c..2a7351124 100644 --- a/bench/spbench/spbenchsolver.cpp +++ b/bench/spbench/spbenchsolver.cpp @@ -54,7 +54,7 @@ int main(int argc, char ** args) statbuf.close(); } else - std::cerr << "Unable to open the provided file for writting... \n"; + std::cerr << "Unable to open the provided file for writing... \n"; } // Get the maximum number of iterations and the tolerance diff --git a/blas/f2c/ctbmv.c b/blas/f2c/ctbmv.c index 790fd581f..a6e0dae80 100644 --- a/blas/f2c/ctbmv.c +++ b/blas/f2c/ctbmv.c @@ -147,7 +147,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/dtbmv.c b/blas/f2c/dtbmv.c index fdf73ebb5..aa67d19da 100644 --- a/blas/f2c/dtbmv.c +++ b/blas/f2c/dtbmv.c @@ -143,7 +143,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/stbmv.c b/blas/f2c/stbmv.c index fcf9ce336..b5a68b545 100644 --- a/blas/f2c/stbmv.c +++ b/blas/f2c/stbmv.c @@ -143,7 +143,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/ztbmv.c b/blas/f2c/ztbmv.c index 4cdcd7f88..3bf0beb01 100644 --- a/blas/f2c/ztbmv.c +++ b/blas/f2c/ztbmv.c @@ -147,7 +147,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). 
*/ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/level1_impl.h b/blas/level1_impl.h index f857bfa20..6e7f8c976 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -33,7 +33,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - // be carefull, *incx==0 is allowed !! + // be careful, *incx==0 is allowed !! if(*incx==1 && *incy==1) make_vector(y,*n) = make_vector(x,*n); else diff --git a/blas/testing/cblat1.f b/blas/testing/cblat1.f index 8ca67fb19..73015f5a9 100644 --- a/blas/testing/cblat1.f +++ b/blas/testing/cblat1.f @@ -619,7 +619,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/dblat1.f b/blas/testing/dblat1.f index 30691f9bf..03d9f1345 100644 --- a/blas/testing/dblat1.f +++ b/blas/testing/dblat1.f @@ -990,7 +990,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/sblat1.f b/blas/testing/sblat1.f index 6657c2693..4d43d9b48 100644 --- a/blas/testing/sblat1.f +++ b/blas/testing/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/zblat1.f b/blas/testing/zblat1.f index d30112c63..c00b67dc8 100644 --- a/blas/testing/zblat1.f +++ b/blas/testing/zblat1.f @@ -619,7 +619,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index afc24b5e9..a2a4f54b9 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -20,7 +20,7 @@ include(CTest) set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests") # Overwrite default DartConfiguration.tcl such that ctest can build our unit tests. -# Recall that our unit tests are not in the "all" target, so we have to explicitely ask ctest to build our custom 'buildtests' target. +# Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target. 
# At this stage, we can also add custom flags to the build tool through the user defined EIGEN_TEST_BUILD_FLAGS variable. file(READ "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" EIGEN_DART_CONFIG_FILE) # try to grab the default flags @@ -39,7 +39,7 @@ ei_init_testing() # configure Eigen related testing options option(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions using exceptions" OFF) -option(EIGEN_DEBUG_ASSERTS "Enable advanced debuging of assertions" OFF) +option(EIGEN_DEBUG_ASSERTS "Enable advanced debugging of assertions" OFF) if(CMAKE_COMPILER_IS_GNUCXX) option(EIGEN_COVERAGE_TESTING "Enable/disable gcov" OFF) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 4a34ddef5..16d6d279f 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -247,7 +247,7 @@ endmacro(ei_add_test_internal_sycl) # # If EIGEN_SPLIT_LARGE_TESTS is ON, the test is split into multiple executables # test__ -# where N runs from 1 to the greatest occurence found in the source file. Each of these +# where N runs from 1 to the greatest occurrence found in the source file. Each of these # executables is built passing -DEIGEN_TEST_PART_N. This allows to split large tests # into smaller executables. # @@ -269,8 +269,8 @@ macro(ei_add_test testname) file(READ "${filename}" test_source) set(parts 0) string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" - occurences "${test_source}") - string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}") + occurrences "${test_source}") + string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") list(REMOVE_DUPLICATES suffixes) if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) add_custom_target(${testname}) @@ -303,8 +303,8 @@ macro(ei_add_test_sycl testname) file(READ "${filename}" test_source) set(parts 0) string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" - occurences "${test_source}") - string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}") + occurrences "${test_source}") + string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") list(REMOVE_DUPLICATES suffixes) if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) add_custom_target(${testname}) diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index e61dedc46..29f2a5007 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -243,7 +243,7 @@ endfunction() ####################### # # Adds a SYCL compilation custom command associated with an existing -# target and sets a dependancy on that new command. +# target and sets a dependency on that new command. # # targetName : Name of the target to add a SYCL to. # binaryDir : Intermediate directory to output the integration header. 
diff --git a/cmake/FindEigen3.cmake b/cmake/FindEigen3.cmake index 657440ba5..52efb4e15 100644 --- a/cmake/FindEigen3.cmake +++ b/cmake/FindEigen3.cmake @@ -15,7 +15,7 @@ # Eigen3::Eigen - The header-only Eigen library # # This module reads hints about search locations from -# the following enviroment variables: +# the following environment variables: # # EIGEN3_ROOT # EIGEN3_ROOT_DIR diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat index 07aa43739..35ef5807c 100644 --- a/debug/msvc/eigen_autoexp_part.dat +++ b/debug/msvc/eigen_autoexp_part.dat @@ -14,7 +14,7 @@ ; * - Eigen::Matrix<*,-1,+,*,*,*> ; * - Eigen::Matrix<*,+,+,*,*,*> ; * -; * Matrices are displayed properly independantly of the memory +; * Matrices are displayed properly independently of the memory ; * alignment (RowMajor vs. ColMajor). ; * ; * This file is distributed WITHOUT ANY WARRANTY. Please ensure diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 2109978fe..49b9fba39 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -1764,7 +1764,7 @@ UML_LOOK = YES # the class node. If there are many fields or methods and many nodes the # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS # threshold limits the number of items for each type to make the size more -# managable. Set this to 0 for no limit. Note that the threshold may be +# manageable. Set this to 0 for no limit. Note that the threshold may be # exceeded by 50% before the limit is enforced. UML_LIMIT_NUM_FIELDS = 10 diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox index 152dda47d..e054714f9 100644 --- a/doc/FunctionsTakingEigenTypes.dox +++ b/doc/FunctionsTakingEigenTypes.dox @@ -133,7 +133,7 @@ In this special case, the example is fine and will be working because both param \section TopicPlainFunctionsFailing In which cases do functions taking a plain Matrix or Array argument fail? -Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const paramter which allows us to store the result. A first naive implementation might look as follows. +Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const parameter which allows us to store the result. A first naive implementation might look as follows. \code // Note: This code is flawed! void cov(const MatrixXf& x, const MatrixXf& y, MatrixXf& C) @@ -176,7 +176,7 @@ The implementation above does now not only work with temporary expressions but i \section TopicResizingInGenericImplementations How to resize matrices in generic implementations? -One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the follwing code to work +One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the following code to work \code MatrixXf x = MatrixXf::Random(100,3); MatrixXf y = MatrixXf::Random(100,3); diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index b6d08c700..b49f7d3cf 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox @@ -51,7 +51,7 @@ are doing. 
\section TopicPreprocessorDirectivesCppVersion C++ standard features -By default, %Eigen strive to automatically detect and enable langage features at compile-time based on +By default, %Eigen strive to automatically detect and enable language features at compile-time based on the information provided by the compiler. - \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER. diff --git a/doc/QuickStartGuide.dox b/doc/QuickStartGuide.dox index ea32c3b3d..23bb2981b 100644 --- a/doc/QuickStartGuide.dox +++ b/doc/QuickStartGuide.dox @@ -68,7 +68,7 @@ The output is as follows: The second example starts by declaring a 3-by-3 matrix \c m which is initialized using the \link DenseBase::Random(Index,Index) Random() \endlink method with random values between -1 and 1. The next line applies a linear mapping such that the values are between 10 and 110. The function call \link DenseBase::Constant(Index,Index,const Scalar&) MatrixXd::Constant\endlink(3,3,1.2) returns a 3-by-3 matrix expression having all coefficients equal to 1.2. The rest is standard arithmetics. -The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left unitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows: +The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left uninitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows: \f[ v = diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox index a25622e80..81a73eec2 100644 --- a/doc/SparseQuickReference.dox +++ b/doc/SparseQuickReference.dox @@ -80,7 +80,7 @@ sm1.setZero(); \section SparseBasicInfos Matrix properties -Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some informations from the matrix. +Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some information from the matrix.
\code diff --git a/doc/TemplateKeyword.dox b/doc/TemplateKeyword.dox index b84cfdae9..fbf2c7081 100644 --- a/doc/TemplateKeyword.dox +++ b/doc/TemplateKeyword.dox @@ -76,7 +76,7 @@ point where the template is defined, without knowing the actual value of the tem and \c Derived2 in the example). That means that the compiler cannot know that dst.triangularView is a member template and that the following < symbol is part of the delimiter for the template parameter. Another possibility would be that dst.triangularView is a member variable with the < -symbol refering to the operator<() function. In fact, the compiler should choose the second +symbol referring to the operator<() function. In fact, the compiler should choose the second possibility, according to the standard. If dst.triangularView is a member template (as in our case), the programmer should specify this explicitly with the \c template keyword and write dst.template triangularView. diff --git a/doc/TopicLazyEvaluation.dox b/doc/TopicLazyEvaluation.dox index 101ef8c72..b7820e3e6 100644 --- a/doc/TopicLazyEvaluation.dox +++ b/doc/TopicLazyEvaluation.dox @@ -58,7 +58,7 @@ the product matrix3 * matrix4 gets evaluated immediately into a tempora \code matrix1 = matrix2 * (matrix3 + matrix4); \endcode -Here, provided the matrices have at least 2 rows and 2 columns, each coefficienct of the expression matrix3 + matrix4 is going to be used several times in the matrix product. Instead of computing the sum everytime, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates matrix3 + matrix4 into a temporary variable before evaluating the product. +Here, provided the matrices have at least 2 rows and 2 columns, each coefficienct of the expression matrix3 + matrix4 is going to be used several times in the matrix product. Instead of computing the sum every time, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates matrix3 + matrix4 into a temporary variable before evaluating the product. */ diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 491470627..991f964cc 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -248,7 +248,7 @@ To get an overview of the true relative speed of the different decomposition, ch
Blocking
Means the algorithm can work per block, whence guaranteeing a good scaling of the performance for large matrices.
Implicit Multi Threading (MT)
-
Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product rountines.
+
Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product routines.
Explicit Multi Threading (MT)
Means the algorithm is explicitly parallelized to take advantage of multicore processors via OpenMP.
Meta-unroller
diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox index 47c9b261f..bc394f484 100644 --- a/doc/TopicMultithreading.dox +++ b/doc/TopicMultithreading.dox @@ -47,7 +47,7 @@ int main(int argc, char** argv) \warning note that all functions generating random matrices are \b not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to Eigen::initParallel(). This is because these functions are based on std::rand which is not re-entrant. For thread-safe random generator, we recommend the use of boost::random or c++11 random feature. -In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallization as detailed in the previous section. +In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallelization as detailed in the previous section. */ diff --git a/doc/TutorialMapClass.dox b/doc/TutorialMapClass.dox index f8fb0fd2f..caa2539d8 100644 --- a/doc/TutorialMapClass.dox +++ b/doc/TutorialMapClass.dox @@ -29,9 +29,9 @@ Map mi(pi); \endcode where \c pi is an \c int \c *. In this case the size does not have to be passed to the constructor, because it is already specified by the Matrix/Array type. -Note that Map does not have a default constructor; you \em must pass a pointer to intialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew). +Note that Map does not have a default constructor; you \em must pass a pointer to initialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew). -Map is flexible enough to accomodate a variety of different data representations. There are two other (optional) template parameters: +Map is flexible enough to accommodate a variety of different data representations. There are two other (optional) template parameters: \code Map mat(rows,cols); // default is column major 2: mat.reserve(VectorXi::Constant(cols,6)); diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox index 0f7022973..8676faa1b 100644 --- a/doc/UnalignedArrayAssert.dox +++ b/doc/UnalignedArrayAssert.dox @@ -117,8 +117,8 @@ It doesn't disable 16-byte alignment, because that would mean that vectorized an \section checkmycode How can I check my code is safe regarding alignment issues? -Unfortunately, there is no possibility in C++ to detect any of the aformentioned shortcoming at compile time (though static analysers are becoming more and more powerful and could detect some of them). -Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the begining of this page. +Unfortunately, there is no possibility in C++ to detect any of the aforementioned shortcoming at compile time (though static analysers are becoming more and more powerful and could detect some of them). +Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the beginning of this page. Therefore, if your program runs fine on a given system with some given compilation flags, then this does not guarantee that your code is safe. For instance, on most 64 bits systems buffer are aligned on 16 bytes boundary and so, if you do not enable AVX instruction set, then your code will run fine. On the other hand, the same code may assert if moving to a more exotic platform, or enabling AVX instructions that required 32 bytes alignment by default. 
The situation is not hopeless though. Assuming your code is well covered by unit test, then you can check its alignment safety by linking it to a custom malloc library returning 8 bytes aligned buffers only. This way all alignment shortcomings should pop-up. To this end, you must also compile your program with \link TopicPreprocessorDirectivesPerformance EIGEN_MALLOC_ALREADY_ALIGNED=0 \endlink. diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox index 9bcdf0bfc..36beb2ddd 100644 --- a/doc/UsingNVCC.dox +++ b/doc/UsingNVCC.dox @@ -5,7 +5,7 @@ namespace Eigen { Staring from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays for fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc most Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords making them callable from both host and device code. This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen's header. -This might be usefull to disable some warnings when a .cu file makes use of Eigen on the host side only. +This might be useful to disable some warnings when a .cu file makes use of Eigen on the host side only. However, in both cases, host's SIMD vectorization has to be disabled in .cu files. It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files. diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css index 6ce2b839b..b99d7914a 100644 --- a/doc/eigendoxy.css +++ b/doc/eigendoxy.css @@ -93,7 +93,7 @@ table th.inter { border-color: #cccccc; } -/** class for exemple / output tables **/ +/** class for example / output tables **/ table.example { } diff --git a/doc/special_examples/Tutorial_sparse_example.cpp b/doc/special_examples/Tutorial_sparse_example.cpp index 830e196ea..89937b411 100644 --- a/doc/special_examples/Tutorial_sparse_example.cpp +++ b/doc/special_examples/Tutorial_sparse_example.cpp @@ -12,7 +12,7 @@ int main(int argc, char** argv) assert(argc==2); int n = 300; // size of the image - int m = n*n; // number of unknows (=number of pixels) + int m = n*n; // number of unknowns (=number of pixels) // Assembly: std::vector coefficients; // list of non-zeros coefficients diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 9883d4c72..52f18edfc 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -35,7 +35,7 @@ set(EigenLapack_SRCS ${EigenLapack_SRCS} second_NONE.f dsecnd_NONE.f ) -option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enbale the Lapack unit tests") +option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests") if(EIGEN_ENABLE_LAPACK_TESTS) @@ -59,7 +59,7 @@ if(EIGEN_ENABLE_LAPACK_TESTS) message(STATUS "Setup lapack reference and lapack unit tests") execute_process(COMMAND tar xzf "lapack_addons_3.4.1.tgz" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) else() - message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests wont be enabled") + message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests won't be enabled") set(EIGEN_ENABLE_LAPACK_TESTS false) endif() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8bd086ce3..8bcf3f7c5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,5 @@ # generate split test header file only if it does not yet exist -# in order to prevent a rebuild everytime cmake is configured +# in order to prevent a rebuild every time 
cmake is configured if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "") foreach(i RANGE 1 999) diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index 6c7b09696..109218766 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -104,7 +104,7 @@ void test_bdcsvd() CALL_SUBTEST_7( BDCSVD(10,10) ); // Check that preallocation avoids subsequent mallocs - // Disbaled because not supported by BDCSVD + // Disabled because not supported by BDCSVD // CALL_SUBTEST_9( svd_preallocate() ); CALL_SUBTEST_2( svd_underoverflow() ); diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp index 293b1b265..03d5774ef 100644 --- a/test/eigensolver_complex.cpp +++ b/test/eigensolver_complex.cpp @@ -47,7 +47,7 @@ template bool find_pivot(typename MatrixType::Scalar tol, M return false; } -/* Check that two column vectors are approximately equal upto permutations. +/* Check that two column vectors are approximately equal up to permutations. * Initially, this method checked that the k-th power sums are equal for all k = 1, ..., vec1.rows(), * however this strategy is numerically inacurate because of numerical cancellation issues. */ diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp index 8ee8fdb27..5854d39c5 100644 --- a/test/geo_quaternion.cpp +++ b/test/geo_quaternion.cpp @@ -241,7 +241,7 @@ template void mapQuaternion(void){ const MQuaternionUA& cmq3(mq3); VERIFY( &cmq3.x() == &mq3.x() ); // FIXME the following should be ok. The problem is that currently the LValueBit flag - // is used to determine wether we can return a coeff by reference or not, which is not enough for Map. + // is used to determine whether we can return a coeff by reference or not, which is not enough for Map. //const MCQuaternionUA& cmcq3(mcq3); //VERIFY( &cmcq3.x() == &mcq3.x() ); } diff --git a/test/main.h b/test/main.h index 6079cbd06..14f4e3e7a 100644 --- a/test/main.h +++ b/test/main.h @@ -183,7 +183,7 @@ namespace Eigen }; } // If EIGEN_DEBUG_ASSERTS is defined and if no assertion is triggered while - // one should have been, then the list of excecuted assertions is printed out. + // one should have been, then the list of executed assertions is printed out. // // EIGEN_DEBUG_ASSERTS is not enabled by default as it // significantly increases the compilation time diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 08b360340..3c11df7e8 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -28,7 +28,7 @@ template T negate(const T& x) { return -x; } } } -// NOTE: we disbale inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU. +// NOTE: we disable inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU. 
template EIGEN_DONT_INLINE bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits::Real& refvalue) { -- cgit v1.2.3 From 2cbb00b18ee3d02d3b747bcb1775d7f851e46d7e Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 13 Apr 2018 19:14:25 +0200 Subject: No need to make noise, if KLU is found --- test/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) (limited to 'test/CMakeLists.txt') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8bcf3f7c5..f8deafc07 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -309,7 +309,6 @@ if(UMFPACK_FOUND) endif() if(KLU_FOUND OR SuiteSparse_FOUND) - message("ADDING KLU TEST") ei_add_test(klu_support "" "${KLU_ALL_LIBS}") endif() -- cgit v1.2.3 From 750af063629cd366f6d01985b37a21ebc37a5af3 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 22 May 2018 21:04:32 +0200 Subject: Add an option to test with external BLAS library --- cmake/EigenTesting.cmake | 4 ++++ test/CMakeLists.txt | 9 +++++++++ 2 files changed, 13 insertions(+) (limited to 'test/CMakeLists.txt') diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 16d6d279f..7d2d63722 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -679,6 +679,10 @@ macro(ei_set_build_string) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${LOCAL_COMPILER_FLAGS}) endif() + if(EIGEN_TEST_EXTERNAL_BLAS) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-external_blas) + endif() + ei_is_64bit_env(IS_64BIT_ENV) if(NOT IS_64BIT_ENV) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-32bit) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f8deafc07..073effd81 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -30,6 +30,15 @@ if(NOT EIGEN_Fortran_COMPILER_WORKS) find_package(LAPACK QUIET) endif() +# TODO do the same for EXTERNAL_LAPACK +option(EIGEN_TEST_EXTERNAL_BLAS "Use external BLAS library for testsuite" OFF) +if(EIGEN_TEST_EXTERNAL_BLAS) + find_package(BLAS REQUIRED) + message(STATUS "BLAS_COMPILER_FLAGS: ${BLAS_COMPILER_FLAGS}") + add_definitions("-DEIGEN_USE_BLAS") # is adding ${BLAS_COMPILER_FLAGS} necessary? + list(APPEND EXTERNAL_LIBS "${BLAS_LIBRARIES}") +endif(EIGEN_TEST_EXTERNAL_BLAS) + # configure blas/lapack (use Eigen's ones) set(EIGEN_BLAS_LIBRARIES eigen_blas) set(EIGEN_LAPACK_LIBRARIES eigen_lapack) -- cgit v1.2.3 From eef4b7bd8768a79078ba6d4adc9f85b3f5937e39 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 May 2018 20:49:06 +0200 Subject: Fix handling of path names containing spaces and the likes. --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'test/CMakeLists.txt') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 073effd81..12c12b06d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,7 +47,7 @@ set(EIGEN_TEST_MATRIX_DIR "" CACHE STRING "Enable testing of realword sparse mat if(EIGEN_TEST_MATRIX_DIR) if(NOT WIN32) message(STATUS "Test realworld sparse matrices: ${EIGEN_TEST_MATRIX_DIR}") - add_definitions( -DTEST_REAL_CASES="${EIGEN_TEST_MATRIX_DIR}" ) + add_definitions( -DTEST_REAL_CASES=${EIGEN_TEST_MATRIX_DIR} ) else(NOT WIN32) message(STATUS "REAL CASES CAN NOT BE CURRENTLY TESTED ON WIN32") endif(NOT WIN32) -- cgit v1.2.3 From 999b552c16fe48ba6ca2ea0fe8d8788775f3c58d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 May 2018 20:49:25 +0200 Subject: Search for sequential Pastix. 
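A hedged aside on the EIGEN_TEST_EXTERNAL_BLAS option introduced above: the option simply compiles the test suite with -DEIGEN_USE_BLAS and links ${BLAS_LIBRARIES}, which is the same mechanism application code can opt into directly. The sketch below is illustrative only and not part of any patch; the file, matrix size, and return check are arbitrary, and it assumes a BLAS library is linked at build time.

#define EIGEN_USE_BLAS   // normally passed on the compile line, as the CMake option above does
#include <Eigen/Dense>

int main() {
  // With EIGEN_USE_BLAS defined and a BLAS library linked, dense products like
  // this one are forwarded to the external gemm routines instead of Eigen's own kernels.
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd C = A * B;
  return C.rows() == 512 ? 0 : 1;  // trivial use of the result
}
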
--- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'test/CMakeLists.txt') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 12c12b06d..e1eef086e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -100,7 +100,7 @@ else() endif() -find_package(PASTIX QUIET COMPONENTS METIS SCOTCH) +find_package(PASTIX QUIET COMPONENTS METIS SEQ) # check that the PASTIX found is a version without MPI find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS NAMES pastix_nompi.h -- cgit v1.2.3 From 8fbd47052bcafea612b8ae2841c1de5db738f042 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 6 Jun 2018 10:12:58 -0400 Subject: Adding support for using Eigen in HIP kernels. This commit enables the use of Eigen on HIP kernels / AMD GPUs. Support has been added along the same lines as what already exists for using Eigen in CUDA kernels / NVidia GPUs. Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers get picked up by default during Eigen compile (irrespective of whether or not the underlying compiler is CUDACC/NVCC, for e.g. Eigen/src/Core/arch/CUDA/Half.h). In order to maintain this behavior, the EIGEN_USE_HIP macro is used to switch to using the HIP version of those header files (see Eigen/Core and unsupported/Eigen/CXX11/Tensor) Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP specific unit tests. --- Eigen/Core | 69 +- Eigen/src/Core/GenericPacketMath.h | 8 +- Eigen/src/Core/MathFunctions.h | 61 +- Eigen/src/Core/ProductEvaluators.h | 6 + Eigen/src/Core/arch/HIP/hcc/Half.h | 705 +++++++++ Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h | 1019 +++++++++++++ Eigen/src/Core/arch/HIP/hcc/TypeCasting.h | 212 +++ Eigen/src/Core/arch/HIP/hcc/math_constants.h | 23 + Eigen/src/Core/functors/BinaryFunctors.h | 6 + Eigen/src/Core/util/BlasUtil.h | 5 +- Eigen/src/Core/util/Macros.h | 5 +- Eigen/src/Core/util/Memory.h | 38 + Eigen/src/Core/util/Meta.h | 52 +- Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 1 + Eigen/src/SVD/BDCSVD.h | 2 +- cmake/EigenTesting.cmake | 9 +- test/CMakeLists.txt | 42 + test/hip_basic.cu | 172 +++ test/hip_common.h | 103 ++ test/main.h | 22 +- unsupported/Eigen/CXX11/Tensor | 40 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 10 +- .../CXX11/src/Tensor/TensorContractionBlocking.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorContractionHip.h | 1521 ++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorConvolutionHip.h | 1119 ++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorDeviceHip.h | 352 +++++ .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 6 +- unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 24 +- .../Eigen/CXX11/src/Tensor/TensorReductionHip.h | 815 +++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 9 +- unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 12 + unsupported/Eigen/CXX11/src/util/EmulateArray.h | 2 +- .../src/SpecialFunctions/SpecialFunctionsImpl.h | 4 +- unsupported/test/CMakeLists.txt | 52 + unsupported/test/cxx11_tensor_argmax_hip.cu | 251 ++++ unsupported/test/cxx11_tensor_cast_float16_hip.cu | 79 + unsupported/test/cxx11_tensor_contract_hip.cu | 215 +++ 
unsupported/test/cxx11_tensor_device_hip.cu | 389 +++++ unsupported/test/cxx11_tensor_hip.cu | 1296 +++++++++++++++++ unsupported/test/cxx11_tensor_of_float16_hip.cu | 498 +++++++ unsupported/test/cxx11_tensor_random_hip.cu | 85 ++ unsupported/test/cxx11_tensor_reduction_hip.cu | 154 ++ unsupported/test/cxx11_tensor_scan_hip.cu | 76 + 50 files changed, 9527 insertions(+), 94 deletions(-) create mode 100644 Eigen/src/Core/arch/HIP/hcc/Half.h create mode 100644 Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h create mode 100644 Eigen/src/Core/arch/HIP/hcc/TypeCasting.h create mode 100644 Eigen/src/Core/arch/HIP/hcc/math_constants.h create mode 100644 test/hip_basic.cu create mode 100644 test/hip_common.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h create mode 100644 unsupported/test/cxx11_tensor_argmax_hip.cu create mode 100644 unsupported/test/cxx11_tensor_cast_float16_hip.cu create mode 100644 unsupported/test/cxx11_tensor_contract_hip.cu create mode 100644 unsupported/test/cxx11_tensor_device_hip.cu create mode 100644 unsupported/test/cxx11_tensor_hip.cu create mode 100644 unsupported/test/cxx11_tensor_of_float16_hip.cu create mode 100644 unsupported/test/cxx11_tensor_random_hip.cu create mode 100644 unsupported/test/cxx11_tensor_reduction_hip.cu create mode 100644 unsupported/test/cxx11_tensor_scan_hip.cu (limited to 'test/CMakeLists.txt') diff --git a/Eigen/Core b/Eigen/Core index f6bc18a08..c72d5468a 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -22,6 +22,17 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif +#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) + // analogous to EIGEN_CUDACC, but for HIP + #define EIGEN_HIPCC __HIPCC__ +#endif + +// NVCC is not supported as the target platform for HIPCC +// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive +#if defined(__NVCC__) && defined(__HIPCC__) + #error "NVCC as the target platform for HIPCC is currently not supported." +#endif + // Starting with CUDA 9 the composite __CUDACC_VER__ is not available. #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) @@ -32,8 +43,8 @@ #endif // Handle NVCC/CUDA/SYCL -#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) - // Do not try asserts on CUDA and SYCL! +#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC) + // Do not try asserts on CUDA, HIP and SYCL! 
#ifndef EIGEN_NO_DEBUG #define EIGEN_NO_DEBUG #endif @@ -57,6 +68,26 @@ // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro // works properly on the device side #include + + #elif defined(EIGEN_HIPCC) + // Do not try to vectorize on HIP + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif + + #define EIGEN_DEVICE_FUNC __host__ __device__ + // We need hip_runtime.h to ensure that that EIGEN_USING_STD_MATH macro + // works properly on the device side + #include + + #if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP) + // analogous to EIGEN_CUDA_ARCH, but for HIP + #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ + // Note this check needs to come after we include hip_runtime.h since + // hip_runtime.h includes hip_common.h which in turn has the define + // for __HIP_DEVICE_COMPILE__ + #endif + #else #define EIGEN_DEVICE_FUNC #endif @@ -68,16 +99,16 @@ #define EIGEN_DONT_VECTORIZE #endif -// When compiling CUDA device code with NVCC, pull in math functions from the -// global namespace. In host mode, and when device doee with clang, use the -// std versions. -#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__) +// When compiling CUDA device code with NVCC, or HIP device code with HIPCC +// pull in math functions from the global namespace. In host mode, and when +// device doee with clang, use the std versions. +#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIPCC__)) #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; #else #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; #endif -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE) #define EIGEN_EXCEPTIONS #endif @@ -270,6 +301,17 @@ #include #endif +#if defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_HAS_HIP_FP16 + #include + #define HIP_PATCH_WITH_NEW_FP16 18215 + #if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16) + #define EIGEN_HAS_OLD_HIP_FP16 + // Old HIP implementation does not have a explicit typedef for "half2" + typedef __half2 half2; + #endif +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -390,7 +432,6 @@ using std::ptrdiff_t; #include "src/Core/util/IntegralConstant.h" #include "src/Core/util/SymbolicIndex.h" - #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" @@ -434,9 +475,15 @@ using std::ptrdiff_t; #endif // Half float support -#include "src/Core/arch/CUDA/Half.h" -#include "src/Core/arch/CUDA/PacketMathHalf.h" -#include "src/Core/arch/CUDA/TypeCasting.h" +#if defined EIGEN_USE_HIP + #include "src/Core/arch/HIP/hcc/Half.h" + #include "src/Core/arch/HIP/hcc/PacketMathHalf.h" + #include "src/Core/arch/HIP/hcc/TypeCasting.h" +#else + #include "src/Core/arch/CUDA/Half.h" + #include "src/Core/arch/CUDA/PacketMathHalf.h" + #include "src/Core/arch/CUDA/TypeCasting.h" +#endif #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 888a3f7ea..0903c3a6e 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -299,7 +299,11 @@ template EIGEN_DEVICE_FUNC inline void pstoreu { pstore(to, 
from); } /** \internal tries to do cache prefetching of \a addr */ -template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) +template + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + inline void prefetch(const Scalar* addr) { #ifdef EIGEN_CUDA_ARCH #if defined(__LP64__) @@ -528,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes. -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 05462c5e1..6beef5def 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template struct real_impl > { @@ -144,7 +144,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template struct imag_impl > { @@ -260,7 +260,7 @@ struct conj_default_impl template struct conj_impl : conj_default_impl {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template struct conj_impl > { @@ -435,7 +435,12 @@ struct round_retval struct arg_impl { static inline Scalar run(const Scalar& x) { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // HIP does not seem to have a native device side implementation for the math routine "arg" + using std::arg; + #else EIGEN_USING_STD_MATH(arg); + #endif return arg(x); } }; @@ -768,7 +773,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isfinite(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -783,7 +790,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isinf(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -798,7 +807,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isnan(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -864,7 +875,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) && !defined(__SYCL_DEVICE_ONLY__) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -1078,7 +1089,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || 
defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1136,7 +1147,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1157,7 +1168,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1215,7 +1226,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1243,7 +1254,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1273,7 +1284,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } @@ -1309,7 +1320,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1329,7 +1340,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1349,7 +1360,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1369,7 +1380,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1400,7 +1411,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } EIGEN_ALWAYS_INLINE double acosh(double x) { return 
cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1431,7 +1442,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1462,7 +1473,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); } EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1483,7 +1494,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1503,7 +1514,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1521,12 +1532,12 @@ T tanh(const T &x) { #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH +#elif (!defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1546,7 +1557,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y) EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index fb637191d..cc75fbce3 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -137,6 +137,9 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { @@ -390,6 +393,9 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; template + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); diff --git 
a/Eigen/src/Core/arch/HIP/hcc/Half.h b/Eigen/src/Core/arch/HIP/hcc/Half.h new file mode 100644 index 000000000..2ce8a412c --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/Half.h @@ -0,0 +1,705 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// type Eigen::half (inheriting from HIP's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. + + +#ifndef EIGEN_HALF_HIP_H +#define EIGEN_HALF_HIP_H + +#if __cplusplus > 199711L +#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() +#else +#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() +#endif + + +namespace Eigen { + +struct half; + +namespace half_impl { + +#if !defined(EIGEN_HAS_HIP_FP16) +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + unsigned short x; +}; +#elif defined(EIGEN_HAS_OLD_HIP_FP16) +// Make a __half_raw definition that is +// ++ compatible with that of Eigen and +// ++ add a implcit conversion to the native __half of the old HIP implementation. +// +// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation. 
+// +// In the old HIP implementation, +// ++ __half is a typedef of __fp16 +// ++ the "__h*" routines take "__half" arguments +// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make +// that conversion in each call to a "__h*" routine...that is why we have the "operator __half" routine +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + union { + unsigned short x; + __half data; + }; + operator __half(void) const { return data; } +}; +#endif + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); + +struct half_base : public __half_raw { + EIGEN_DEVICE_FUNC half_base() {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {} + #else + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + #endif +#endif +}; + +} // namespace half_impl + +// Class definition. +struct half : public half_impl::half_base { + #if !defined(EIGEN_HAS_HIP_FP16) || defined(EIGEN_HAS_OLD_HIP_FP16) + typedef half_impl::__half_raw __half_raw; + #endif + + EIGEN_DEVICE_FUNC half() {} + + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} +#endif + + explicit EIGEN_DEVICE_FUNC half(bool b) + : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + template<class T> + explicit EIGEN_DEVICE_FUNC half(const T& val) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {} + explicit EIGEN_DEVICE_FUNC half(float f) + : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} + + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { + // +0.0 and -0.0 become false, everything else becomes true. 
+ return (x & 0x7fff) != 0; + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { + return static_cast(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { + return half_impl::half_to_float(*this); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { + return static_cast(half_impl::half_to_float(*this)); + } + + EIGEN_DEVICE_FUNC half& operator=(const half& other) { + x = other.x; + return *this; + } +}; + +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half denorm_min() { return 
Eigen::half_impl::raw_uint16_to_half(0x1); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // end namespace std + +namespace Eigen { + +namespace half_impl { + +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + +// Intrinsics for native fp16 support. Note that on current hardware, +// these are no faster than fp32 arithmetic (you need to use the half2 +// versions to get the ALU speed increased), but you do save the +// conversion steps back and forth. + +EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { + return __hadd(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { + return __hneg(a); +} +EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { + a = a + b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { + a = a * b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { + a = a - b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { + a = a / b; + return a; +} +EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { + return __heq(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { + return __hne(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { + return __hle(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { + return __hgt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { + return __hge(a, b); +} + +#else // Emulate support for half floats + +// Definitions for CPUs mostly working through conversion to/from fp32. 
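An illustrative host-side sketch of this emulation (assuming <Eigen/Core> is included so that Eigen::half and the operators above are visible):

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::half a(1.5f), b(0.25f);
      // Outside the EIGEN_HAS_HIP_FP16 device path, a + b is evaluated as
      // half(float(a) + float(b)): widen to float, add, round back to half.
      Eigen::half c = a + b;
      std::cout << float(c) << "\n";   // prints 1.75 (exactly representable in fp16)
      return 0;
    }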
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return numext::equal_strict(float(a),float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return numext::not_equal_strict(float(a), float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return float(a) >= float(b); +} + +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { + return half(static_cast(a) / static_cast(b)); +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. 
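To make the bit layout concrete, a small host-only sketch (no HIP fp16 path) using the three helpers declared earlier in this file; the values are chosen purely for illustration:

    #include <Eigen/Core>

    void half_bits_demo() {
      using namespace Eigen::half_impl;
      __half_raw one   = raw_uint16_to_half(0x3c00);  // sign 0, exponent 01111, mantissa 0 -> 1.0
      float      f     = half_to_float(one);          // 1.0f
      __half_raw tenth = float_to_half_rtne(0.1f);    // 0.1f is not exactly representable in fp16;
                                                      // rounds (to nearest even) to 0x2e66, ~0.09998
    }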
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; + h.x = x; + return h; +} + +union FP32 { + unsigned int u; + float f; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + __half tmp_ff = __float2half(ff); + #if defined(EIGEN_HAS_OLD_HIP_FP16) + __half_raw h; + h.data = tmp_ff; + return h; + #else + return *(__half_raw*)&tmp_ff; + #endif + +#elif defined(EIGEN_HAS_FP16_C) + __half_raw h; + h.x = _cvtss_sh(ff, 0); + return h; + +#else + FP32 f; f.f = ff; + + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + __half_raw o; + o.x = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast(f.u >> 13); + } + } + + o.x |= static_cast(sign >> 16); + return o; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __half2float(h); + +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); + +#else + const FP32 magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? 
+ o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { + return (a.x & 0x7fff) == 0x7c00; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __hisnan(a); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { + half result; + result.x = a.x & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { + return half(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hlog(a)); +#else + return half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { + return half(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { + return half(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { + return half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { + return half(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { + return half(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { + return half(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { + return half(::tanhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hfloor(a)); +#else + return half(::floorf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hceil(a)); +#else + return half(::ceilf(float(a))); +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f2 < f1 ? b : a; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f1 < f2 ? 
b : a; +#endif +} + +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { + os << static_cast(v); + return os; +} + +} // end namespace half_impl + +// import Eigen::half_impl::half into Eigen namespace +// using half_impl::half; + +namespace internal { + +template<> +struct random_default_impl +{ + static inline half run(const half& x, const half& y) + { + return x + (y-x) * half(float(std::rand()) / float(RAND_MAX)); + } + static inline half run() + { + return run(half(-1.f), half(1.f)); + } +}; + +template<> struct is_arithmetic { enum { value = true }; }; + +} // end namespace internal + +template<> struct NumTraits + : GenericNumTraits +{ + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { + return half_impl::raw_uint16_to_half(0x0800); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { + return half_impl::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { + return half_impl::raw_uint16_to_half(0xfbff); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() { + return half_impl::raw_uint16_to_half(0x7c00); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + return half_impl::raw_uint16_to_half(0x7c01); + } +}; + +} // end namespace Eigen + +// C-like standard mathematical functions and trancendentals. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { + Eigen::half result; + result.x = a.x & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { + return Eigen::half(::expf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return Eigen::half(hlog(a)); +#else + return Eigen::half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { + return Eigen::half(::sqrtf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { + return Eigen::half(::floorf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { + return Eigen::half(::ceilf(float(a))); +} + +namespace std { + +#if __cplusplus > 199711L +template <> +struct hash { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { + return static_cast(a.x); + } +}; +#endif + +} // end namespace std + + +// Add the missing shfl_xor intrinsic +#if defined(EIGEN_HAS_HIP_FP16) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__) +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + // FIXME + //return static_cast(__shfl_xor(static_cast(var), laneMask, width)); + return var; +} +#endif + +// ldg() has an overload for __half, but we also need one for Eigen::half. 
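Stepping back briefly, the host-visible pieces defined above (NumTraits<Eigen::half>, the *h helper functions, std::hash<Eigen::half>) can be exercised as in this illustrative sketch (C++11 for the hash specialization):

    #include <Eigen/Core>
    #include <unordered_set>

    void half_host_demo() {
      Eigen::half eps = Eigen::NumTraits<Eigen::half>::epsilon();  // raw bits 0x0800, as defined above
      Eigen::half r   = sqrth(Eigen::half(2.0f));                  // ~1.414, computed via float on the host
      std::unordered_set<Eigen::half> seen;                        // relies on std::hash<Eigen::half>
      seen.insert(r);
      seen.insert(eps);
    }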
+#if defined(EIGEN_HAS_HIP_FP16) && \ + defined(__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__) && defined(__HIP_ARCH_HAS_DYNAMIC_PARALLEL__) +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { + // FIXME + //return Eigen::half_impl::raw_uint16_to_half( + // __ldg(reinterpret_cast(ptr))); + return *ptr; +} +#endif + + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +namespace Eigen { +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isnan)(const Eigen::half& h) { + return (half_impl::isnan)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isinf)(const Eigen::half& h) { + return (half_impl::isinf)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isfinite)(const Eigen::half& h) { + return (half_impl::isfinite)(h); +} + +} // namespace Eigen +} // namespace numext +#endif + +#endif // EIGEN_HALF_HIP_H diff --git a/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h b/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h new file mode 100644 index 000000000..29c3f4671 --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h @@ -0,0 +1,1019 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_HALF_HIP_H +#define EIGEN_PACKET_MATH_HALF_HIP_H + + +namespace Eigen { +namespace internal { + +// Most of the following operations require arch >= 3.0 +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasAdd = 1, + HasMul = 1, + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 1, + HasLog = 1, + HasLog1p = 1 + }; +}; + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; + +template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return half2half2(from); +#else + return __half2half2(from); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { + return *reinterpret_cast(from); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { + *reinterpret_cast(to) = from; +} + +template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> + __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __ldg((const half2*)from); +#endif +} + +template<> +__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __halves2half2(__ldg(from+0), 
__ldg(from+1)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { + return __low2half(a); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { + __half x = __ushort_as_half(__half_as_ushort(__low2half(a)) & 0x7FFF); + __half y = __ushort_as_half(__half_as_ushort(__high2half(a)) & 0x7FFF); + return __halves2half2(x, y); +} + + +__device__ EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { + return __hadd2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { + return __hsub2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { + return __hneg2(a); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { + return __hmul2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { + return __hfma2(a, b, c); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return h2div(a, b); +#else + return __h2div(a, b); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { + return __hadd(__low2half(a), __high2half(a)); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? 
first : second; +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { + return __hmul(__low2half(a), __high2half(a)); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expm1f(a1); + float r2 = expm1f(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 plog(const half2& a) { + return h2log(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 pexp(const half2& a) { + return h2exp(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 psqrt(const half2& a) { + return h2sqrt(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 prsqrt(const half2& a) { + return h2rsqrt(a); +} + +#elif defined EIGEN_VECTORIZE_AVX512 + +typedef struct { + __m256i x; +} Packet16h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. + typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; + +template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { + Packet16h result; + result.x = _mm256_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_load_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_loadu_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { + _mm256_store_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { + _mm256_storeu_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a.x); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); 
+ + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet16h result; + result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + Packet16h result; + result.x = _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) +{ + Packet16h result; + result.x = _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; + to[stride*8].x = aux[8].x; + to[stride*9].x = aux[9].x; + to[stride*10].x = aux[10].x; + to[stride*11].x = aux[11].x; + to[stride*12].x = aux[12].x; + to[stride*13].x = aux[13].x; + to[stride*14].x = aux[14].x; + to[stride*15].x = aux[15].x; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0].x; + __m256i b = kernel.packet[1].x; + __m256i c = kernel.packet[2].x; + __m256i d = kernel.packet[3].x; + __m256i e = kernel.packet[4].x; + __m256i f = kernel.packet[5].x; + __m256i g = kernel.packet[6].x; + __m256i h = kernel.packet[7].x; + __m256i i = kernel.packet[8].x; + __m256i j = kernel.packet[9].x; + __m256i k = kernel.packet[10].x; + __m256i l = kernel.packet[11].x; + __m256i m = kernel.packet[12].x; + __m256i n = kernel.packet[13].x; + __m256i o = kernel.packet[14].x; + __m256i p = kernel.packet[15].x; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = 
_mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. 
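// _mm256_permute2x128_si256(x, y, imm) builds each 128-bit half of its result from one 128-bit
// lane of x or y selected by imm: 0x20 pairs the low lanes of x and y, 0x31 pairs the high lanes.
// That is what reassembles the interleaved 64-bit groups above into the 16 transposed rows below.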
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0].x = a_p_0; + kernel.packet[1].x = a_p_1; + kernel.packet[2].x = a_p_2; + kernel.packet[3].x = a_p_3; + kernel.packet[4].x = a_p_4; + kernel.packet[5].x = a_p_5; + kernel.packet[6].x = a_p_6; + kernel.packet[7].x = a_p_7; + kernel.packet[8].x = a_p_8; + kernel.packet[9].x = a_p_9; + kernel.packet[10].x = a_p_a; + kernel.packet[11].x = a_p_b; + kernel.packet[12].x = a_p_c; + kernel.packet[13].x = a_p_d; + kernel.packet[14].x = a_p_e; + kernel.packet[15].x = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + pstore(in[4], kernel.packet[4]); + pstore(in[5], kernel.packet[5]); + pstore(in[6], kernel.packet[6]); + pstore(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); + kernel.packet[4] = pload(out[4]); + kernel.packet[5] = pload(out[5]); + kernel.packet[6] = pload(out[6]); + kernel.packet[7] = pload(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + + +#elif defined EIGEN_VECTORIZE_AVX + +typedef struct { + __m128i x; +} Packet8h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8h type; + // 
There is no half-size packet for Packet8h. + typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { + Packet8h result; + result.x = _mm_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { + Packet8h result; + result.x = _mm_load_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { + Packet8h result; + result.x = _mm_loadu_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a.x); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet8h result; + result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + Eigen::half h4(aux[4]); + Eigen::half h5(aux[5]); + Eigen::half h6(aux[6]); + Eigen::half h7(aux[7]); + + Packet8h result; + result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) +{ + Packet8h result; + result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> 
EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) +{ + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul(af); + return Eigen::half(reduced); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0].x; + __m128i b = kernel.packet[1].x; + __m128i c = kernel.packet[2].x; + __m128i d = kernel.packet[3].x; + __m128i e = kernel.packet[4].x; + __m128i f = kernel.packet[5].x; + __m128i g = kernel.packet[6].x; + __m128i h = kernel.packet[7].x; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0].x = a0b0c0d0e0f0g0h0; + kernel.packet[1].x = a1b1c1d1e1f1g1h1; + kernel.packet[2].x = a2b2c2d2e2f2g2h2; + kernel.packet[3].x = a3b3c3d3e3f3g3h3; + kernel.packet[4].x = a4b4c4d4e4f4g4h4; + kernel.packet[5].x = a5b5c5d5e5f5g5h5; + kernel.packet[6].x = a6b6c6d6e6f6g6h6; + kernel.packet[7].x = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; 
j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. + typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); +} + +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { 
+ Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) +{ + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast(a); + to[stride*1].x = static_cast(a >> 16); + to[stride*2].x = static_cast(a >> 32); + to[stride*3].x = static_cast(a >> 48); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); +} + +#endif + +} +} + +#endif // EIGEN_PACKET_MATH_HALF_HIP_H diff --git a/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h b/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h new file mode 100644 index 000000000..915266a9d --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_HIP_H +#define EIGEN_TYPE_CASTING_HIP_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __float2half(static_cast(a)); + #else + return Eigen::half(static_cast(a)); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __half2float(a); + #else + return static_cast(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + + +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { + // Simply discard the second half of the input + return __floats2half2_rn(a.x, a.y); +} + +#elif defined EIGEN_VECTORIZE_AVX512 +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { + return float2half(a); +} + +#elif defined EIGEN_VECTORIZE_AVX + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { + return float2half(a); +} + +// Disable the following code since it's broken on too many platforms / compilers. 
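For orientation, these scalar functors are what back Eigen's usual cast API; a rough host-side sketch (assuming the half support above is pulled in through <Eigen/Core>):

    #include <Eigen/Core>

    void cast_demo() {
      Eigen::MatrixXf mf = Eigen::MatrixXf::Random(4, 4);
      Eigen::Matrix<Eigen::half, Eigen::Dynamic, Eigen::Dynamic> mh =
          mf.cast<Eigen::half>();               // scalar path uses scalar_cast_op<float, Eigen::half>
      Eigen::MatrixXf back = mh.cast<float>();  // scalar path uses scalar_cast_op<Eigen::half, float>
    }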
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); + float f1 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 16)); + float f2 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 32)); + float f3 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 48)); + float f4 = static_cast(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_HIP_H diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h new file mode 100644 index 000000000..25375a0a4 --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h @@ -0,0 +1,23 @@ +/* + * math_constants.h - + * HIP equivalent of the CUDA header of the same name + */ + +#ifndef __MATH_CONSTANTS_H__ +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ + +#define HIPRT_INF_F __int_as_float(0x7f800000) +#define HIPRT_NAN_F __int_as_float(0x7fffffff) +#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001) +#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000) +#define HIPRT_ZERO_F 0.0f +#define HIPRT_ONE_F 1.0f + +/* double precision constants */ +#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000) +#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000) + +#endif diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 3eae6b8ca..e269140bd 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -436,6 +436,9 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC explicit + #endif bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -455,6 +458,9 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC explicit + #endif bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index b1791fb3a..a4cde6d95 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -163,7 +163,10 @@ class BlasLinearMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
BlasLinearMapper(Scalar *data) : m_data(data) {} - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 9c68ecb7d..c6e27f6af 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1008,9 +1008,12 @@ namespace Eigen { # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# ifdef EIGEN_CUDA_ARCH +# if defined(EIGEN_CUDA_ARCH) # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") +# elif defined(EIGEN_HIP_DEVICE_COMPILE) +# define EIGEN_THROW_X(X) asm("s_trap 0") +# define EIGEN_THROW asm("s_trap 0") # else # define EIGEN_THROW_X(X) std::abort() # define EIGEN_THROW std::abort() diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 53300c388..87fcc30f5 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -70,7 +70,20 @@ inline void throw_std_bad_alloc() throw std::bad_alloc(); #else std::size_t huge = static_cast(-1); + #if defined(EIGEN_HIPCC) + // + // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining), + // and as a consequence the code in the #else block triggers the hipcc warning : + // "no overloaded function has restriction specifiers that are compatible with the ambient context" + // + // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects + // the same on "operator new" + // Reverting code back to the old version in this #if block for the hipcc compiler + // + new int[huge]; + #else ::operator new(huge); + #endif #endif } @@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + result = aligned_malloc(size); + #else result = std::malloc(size); + #endif + #if EIGEN_DEFAULT_ALIGN_BYTES==16 eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. 
Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); #endif @@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + aligned_free(ptr); + #else std::free(ptr); + #endif + #else handmade_aligned_free(ptr); #endif @@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std: { check_that_malloc_is_allowed(); + #if defined(EIGEN_HIP_DEVICE_COMPILE) + void *result = aligned_malloc(size); + #else void *result = std::malloc(size); + #endif + if(!result && size) throw_std_bad_alloc(); return result; @@ -232,7 +262,11 @@ template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + aligned_free(ptr); + #else std::free(ptr); + #endif } template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) @@ -493,7 +527,11 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); + #if defined(EIGEN_HIP_DEVICE_COMPILE) + ::memcpy(target, start, size); + #else std::memcpy(target, start, size); + #endif } }; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 6e5af35c0..ca6fa6ce9 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -16,6 +16,12 @@ #include #endif +#if defined(EIGEN_HIP_DEVICE_COMPILE) +#include +#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" +#endif + + #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L #include #endif @@ -175,7 +181,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -197,13 +203,31 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { return CUDART_MAX_NORMAL_F; } + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } EIGEN_DEVICE_FUNC static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF_F; } + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } EIGEN_DEVICE_FUNC - static float quiet_NaN() { return CUDART_NAN_F; } + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } }; template<> struct numeric_limits { @@ -214,9 +238,21 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { return CUDART_INF; } + static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } EIGEN_DEVICE_FUNC - static double quiet_NaN() { return CUDART_NAN; } + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } }; template<> struct numeric_limits { @@ -529,13 +565,13 @@ template struct scalar_product_traits namespace numext 
{ -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) using internal::device::numeric_limits; #else using std::numeric_limits; diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 040f8d3bb..bf28edc0e 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -354,6 +354,7 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: + EIGEN_DEVICE_FUNC static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index a24deb96a..e977b9623 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1299,7 +1299,7 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif }//end deflation -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 7d2d63722..f818ae840 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -19,7 +19,9 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) - if(EIGEN_TEST_CUDA_CLANG) + if(EIGEN_TEST_HIP) + hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP") + elseif(EIGEN_TEST_CUDA_CLANG) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) if(CUDA_64_BIT_DEVICE_CODE) link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") @@ -491,6 +493,11 @@ macro(ei_testing_print_summary) else() message(STATUS "CUDA: OFF") endif() + if(EIGEN_TEST_HIP) + message(STATUS "HIP: ON (using hipcc)") + else() + message(STATUS "HIP: OFF") + endif() endif() # vectorization / alignment options diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e1eef086e..4a5c1d36d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -407,6 +407,48 @@ endif(CUDA_FOUND) endif(EIGEN_TEST_CUDA) +# HIP unit tests +option(EIGEN_TEST_HIP "Add HIP support." 
OFF) +if (EIGEN_TEST_HIP) + + set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") + + if (EXISTS ${HIP_PATH}) + + list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) + + find_package(HIP REQUIRED) + if (HIP_FOUND) + + execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) + + if (${HIP_PLATFORM} STREQUAL "hcc") + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${HIP_PATH}/include) + + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + ei_add_test(hip_basic) + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) + + elseif (${HIP_PLATFORM} STREQUAL "nvcc") + message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") + else () + message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") + endif() + + endif(HIP_FOUND) + + else () + + message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") + + endif() + +endif(EIGEN_TEST_HIP) + + + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests) add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON) diff --git a/test/hip_basic.cu b/test/hip_basic.cu new file mode 100644 index 000000000..2e1bf94a4 --- /dev/null +++ b/test/hip_basic.cu @@ -0,0 +1,172 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015-2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// workaround issue between gcc >= 4.7 and cuda 5.5 +#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7) + #undef _GLIBCXX_ATOMIC_BUILTINS + #undef _GLIBCXX_USE_INT128 +#endif + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC hip_basic +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#include + +#include "main.h" +#include "hip_common.h" + +// Check that dense modules can be properly parsed by hipcc +#include + +// struct Foo{ +// EIGEN_DEVICE_FUNC +// void operator()(int i, const float* mats, float* vecs) const { +// using namespace Eigen; +// // Matrix3f M(data); +// // Vector3f x(data+9); +// // Map(data+9) = M.inverse() * x; +// Matrix3f M(mats+i/16); +// Vector3f x(vecs+i*3); +// // using std::min; +// // using std::sqrt; +// Map(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x(); +// //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum(); +// } +// }; + +template +struct coeff_wise { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T x1(in+i); + T x2(in+i+1); + T x3(in+i+2); + Map res(out+i*T::MaxSizeAtCompileTime); + + res.array() += (in[0] * x1 + x2).array() * x3.array(); + } +}; + +template +struct replicate { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T x1(in+i); + int step = x1.size() * 4; + int stride = 3 * step; + + typedef Map > MapType; + MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2); + MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3); + MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3); + } +}; + +template +struct redux { + EIGEN_DEVICE_FUNC + void 
operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + int N = 10; + T x1(in+i); + out[i*N+0] = x1.minCoeff(); + out[i*N+1] = x1.maxCoeff(); + out[i*N+2] = x1.sum(); + out[i*N+3] = x1.prod(); + out[i*N+4] = x1.matrix().squaredNorm(); + out[i*N+5] = x1.matrix().norm(); + out[i*N+6] = x1.colwise().sum().maxCoeff(); + out[i*N+7] = x1.rowwise().maxCoeff().sum(); + out[i*N+8] = x1.matrix().colwise().squaredNorm().sum(); + } +}; + +template +struct prod_test { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const + { + using namespace Eigen; + typedef Matrix T3; + T1 x1(in+i); + T2 x2(in+i+1); + Map res(out+i*T3::MaxSizeAtCompileTime); + res += in[i] * x1 * x2; + } +}; + +template +struct diagonal { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const + { + using namespace Eigen; + T1 x1(in+i); + Map res(out+i*T2::MaxSizeAtCompileTime); + res += x1.diagonal(); + } +}; + +template +struct eigenvalues { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef Matrix Vec; + T M(in+i); + Map res(out+i*Vec::MaxSizeAtCompileTime); + T A = M*M.adjoint(); + SelfAdjointEigenSolver eig; + eig.computeDirect(M); + res = eig.eigenvalues(); + } +}; + +void test_hip_basic() +{ + ei_test_init_hip(); + + int nthreads = 100; + Eigen::VectorXf in, out; + + #ifndef __HIP_DEVICE_COMPILE__ + int data_size = nthreads * 512; + in.setRandom(data_size); + out.setRandom(data_size); + #endif + + CALL_SUBTEST( run_and_compare_to_hip(coeff_wise(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(coeff_wise(), nthreads, in, out) ); + + // FIXME compile fails when we uncomment the followig two tests + // CALL_SUBTEST( run_and_compare_to_hip(replicate(), nthreads, in, out) ); + // CALL_SUBTEST( run_and_compare_to_hip(replicate(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(redux(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(redux(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(prod_test(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(prod_test(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(diagonal(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(diagonal(), nthreads, in, out) ); + + // FIXME : Runtime failure occurs when we uncomment the following two tests + // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues(), nthreads, in, out) ); + // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues(), nthreads, in, out) ); + +} diff --git a/test/hip_common.h b/test/hip_common.h new file mode 100644 index 000000000..251585c52 --- /dev/null +++ b/test/hip_common.h @@ -0,0 +1,103 @@ + +#ifndef EIGEN_TEST_HIP_COMMON_H +#define EIGEN_TEST_HIP_COMMON_H + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include + +#ifndef __HIPCC__ +dim3 threadIdx, blockDim, blockIdx; +#endif + +template +void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out) +{ + for(int i=0; i +__global__ __attribute__((used)) +void run_on_hip_meta_kernel(const Kernel ker, int n, const Input* in, Output* out) +{ + int i = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x; + if(i +void run_on_hip(const Kernel& ker, int n, const Input& in, Output& out) +{ + typename Input::Scalar* d_in; + typename Output::Scalar* d_out; + std::ptrdiff_t in_bytes = in.size() * sizeof(typename 
Input::Scalar); + std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar); + + hipMalloc((void**)(&d_in), in_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_out, out.data(), out_bytes, hipMemcpyHostToDevice); + + // Simple and non-optimal 1D mapping assuming n is not too large + // That's only for unit testing! + dim3 Blocks(128); + dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) ); + + hipDeviceSynchronize(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_hip_meta_kernel::type, + typename std::decay::type>), + dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out); + hipDeviceSynchronize(); + + // check inputs have not been modified + hipMemcpy(const_cast(in.data()), d_in, in_bytes, hipMemcpyDeviceToHost); + hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost); + + hipFree(d_in); + hipFree(d_out); +} + + +template +void run_and_compare_to_hip(const Kernel& ker, int n, const Input& in, Output& out) +{ + Input in_ref, in_hip; + Output out_ref, out_hip; + #ifndef __HIP_DEVICE_COMPILE__ + in_ref = in_hip = in; + out_ref = out_hip = out; + #endif + run_on_cpu (ker, n, in_ref, out_ref); + run_on_hip(ker, n, in_hip, out_hip); + #ifndef __HIP_DEVICE_COMPILE__ + VERIFY_IS_APPROX(in_ref, in_hip); + VERIFY_IS_APPROX(out_ref, out_hip); + #endif +} + + +void ei_test_init_hip() +{ + int device = 0; + hipDeviceProp_t deviceProp; + hipGetDeviceProperties(&deviceProp, device); + std::cout << "HIP device info:\n"; + std::cout << " name: " << deviceProp.name << "\n"; + std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n"; + std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n"; + std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n"; + std::cout << " warpSize: " << deviceProp.warpSize << "\n"; + std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n"; + std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n"; + std::cout << " clockRate: " << deviceProp.clockRate << "\n"; + std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n"; + std::cout << " computeMode: " << deviceProp.computeMode << "\n"; +} + +#endif // EIGEN_TEST_HIP_COMMON_H diff --git a/test/main.h b/test/main.h index 0fcd6cb76..79717a532 100644 --- a/test/main.h +++ b/test/main.h @@ -67,11 +67,17 @@ // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
-#define min(A,B) please_protect_your_min_with_parentheses -#define max(A,B) please_protect_your_max_with_parentheses -#define isnan(X) please_protect_your_isnan_with_parentheses -#define isinf(X) please_protect_your_isinf_with_parentheses -#define isfinite(X) please_protect_your_isfinite_with_parentheses +#if !defined(__HIPCC__) + // HIP headers include the header which contains not-parenthesized + // calls to "max", triggering the following check and causing the compile to fail + // so disabling the following checks for HIP + #define min(A,B) please_protect_your_min_with_parentheses + #define max(A,B) please_protect_your_max_with_parentheses + #define isnan(X) please_protect_your_isnan_with_parentheses + #define isinf(X) please_protect_your_isinf_with_parentheses + #define isfinite(X) please_protect_your_isfinite_with_parentheses +#endif + #ifdef M_PI #undef M_PI #endif @@ -154,7 +160,7 @@ namespace Eigen #define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "") -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) #define EIGEN_EXCEPTIONS #endif @@ -233,7 +239,7 @@ namespace Eigen } #endif //EIGEN_EXCEPTIONS - #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS + #elif !defined(__CUDACC__) && !defined(__HIPCC__)// EIGEN_DEBUG_ASSERTS // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3 #define eigen_assert(a) \ if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ @@ -290,7 +296,7 @@ namespace Eigen std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n"; #endif - #if !defined(__CUDACC__) + #if !defined(__CUDACC__) && !defined(__HIPCC__) #define EIGEN_USE_CUSTOM_ASSERT #endif diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index d243fe035..4b7c7d724 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t; #endif #ifdef EIGEN_USE_GPU -#include -#include -#if __cplusplus >= 201103L -#include -#include -#endif + #include + #if defined(EIGEN_USE_HIP) + #include + #else + #include + #endif + #if __cplusplus >= 201103L + #include + #include + #endif #endif #include "src/Tensor/TensorMacros.h" @@ -95,7 +99,11 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceCuda.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorDeviceHip.h" +#else + #include "src/Tensor/TensorDeviceCuda.h" +#endif #include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorDimensionList.h" @@ -112,16 +120,28 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionCuda.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorReductionHip.h" +#else + #include "src/Tensor/TensorReductionCuda.h" +#endif #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionCuda.h" +#if defined(EIGEN_USE_HIP) + #include 
"src/Tensor/TensorContractionHip.h" +#else + #include "src/Tensor/TensorContractionCuda.h" +#endif #include "src/Tensor/TensorConversion.h" -#include "src/Tensor/TensorConvolution.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorConvolutionHip.h" +#else + #include "src/Tensor/TensorConvolution.h" +#endif #include "src/Tensor/TensorFFT.h" #include "src/Tensor/TensorPatch.h" #include "src/Tensor/TensorImagePatch.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e72ddb4a9..979fcf4d9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -448,7 +448,10 @@ struct TensorContractionEvaluatorBase } template - EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const Index cols = m_k_size; @@ -489,7 +492,10 @@ struct TensorContractionEvaluatorBase } template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemm(Scalar* buffer) const { #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) if (m_can_use_xsmm) { evalGemmXSMM(buffer); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index d34f9caee..4853dd37b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -28,7 +28,10 @@ class TensorContractionBlocking { typedef typename LhsMapper::Scalar LhsScalar; typedef typename RhsMapper::Scalar RhsScalar; - EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h new file mode 100644 index 000000000..7561846a3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h @@ -0,0 +1,1521 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in hipThreadIdx_x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_x * 9 + hipThreadIdx_z; + const Index rhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_z * 8 + hipThreadIdx_x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // hipThreadIdx_x: the vertical position in an 8x8 block + // hipThreadIdx_y: the vertical index of the 8x8 block in the grid + // hipThreadIdx_z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. 
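
The index arithmetic behind that layout is easy to verify on the host. A small sketch, assuming the kernel's 8x8x8 thread block: padding each 8-element LHS row to 9 (72 floats per block) gives every (x, y, z) thread and every k slot its own location in the 72*64-float buffer, so the eight stores per thread never collide:

    #include <cassert>
    #include <set>

    int main() {
      std::set<int> used;
      for (int ty = 0; ty < 8; ++ty)
        for (int tx = 0; tx < 8; ++tx)
          for (int tz = 0; tz < 8; ++tz) {
            const int lhs_store_idx_base = ty * 72 + tx * 9 + tz; // as in the kernel
            for (int k = 0; k < 8; ++k) {
              const int idx = lhs_store_idx_base + 576 * k;       // lhs_store_idx_<k>
              assert(idx < 72 * 64);                              // fits in lhs_shmem
              assert(used.insert(idx).second);                    // no two stores collide
            }
          }
      return 0;
    }
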
+ + const Index load_idx_vert = hipThreadIdx_x + 8 * hipThreadIdx_y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + hipThreadIdx_z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + hipThreadIdx_z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + hipThreadIdx_z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + hipThreadIdx_z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + hipThreadIdx_z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + hipThreadIdx_z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + hipThreadIdx_z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + hipThreadIdx_z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + hipThreadIdx_z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + hipThreadIdx_z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + hipThreadIdx_z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + hipThreadIdx_z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + hipThreadIdx_z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + hipThreadIdx_z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + hipThreadIdx_z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + hipThreadIdx_z + 7 * 
8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. 
Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[hipThreadIdx_x + 9 * hipThreadIdx_y]; + const Scalar* rhs_block = &rhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. 
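
A plain-C++ model of that reduction (the xor-shuffle butterfly defined just below, with 8 array slots standing in for the 8 threads in y) shows every lane ending up with the full sum:

    #include <cassert>
    #include <initializer_list>

    int main() {
      float lane[8] = {1, 2, 3, 4, 5, 6, 7, 8};   // one accumulator per thread in y
      for (int mask : {1, 2, 4}) {                // res(i, j) += __shfl_xor(res(i, j), mask)
        float next[8];
        for (int t = 0; t < 8; ++t) next[t] = lane[t] + lane[t ^ mask];
        for (int t = 0; t < 8; ++t) lane[t] = next[t];
      }
      for (int t = 0; t < 8; ++t) assert(lane[t] == 36.0f);  // 1 + 2 + ... + 8
      return 0;
    }
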
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (hipThreadIdx_x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - hipThreadIdx_y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - hipThreadIdx_z + 7) / 8), 8); + + if (hipThreadIdx_x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? 
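+      // Each thread below re-reads slot i = hipThreadIdx_x of the results its (y, z) group
+      // staged in lhs_shmem and writes it to output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x,
+      // base_n + hipThreadIdx_z + 8 * j) for j = 0..7.
+      // Clamp illustration (hypothetical sizes): m_size = 70, base_m = 64, hipThreadIdx_y = 3
+      // gives max_i_write = min((70 - 64 - 3 + 7) / 8, 8) = 1, so only hipThreadIdx_x == 0
+      // passes the edge check and writes the single surviving row 64 + 3 + 8 * 0 = 67.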
+ Scalar val0 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 0]; + Scalar val1 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 1]; + Scalar val2 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 2]; + Scalar val3 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 3]; + Scalar val4 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 4]; + Scalar val5 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 5]; + Scalar val6 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 6]; + Scalar val7 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 7]; + + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 0) = val0; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 1) = val1; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 2) = val2; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 3) = val3; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 4) = val4; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 5) = val5; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 6) = val6; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j]; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * j) = val; + } + } + } +#undef res +} + + +template +__global__ void +__launch_bounds__(512, 1) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + /*reg = lhs.template loadPacket(row, col);*/ \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row + 3, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + /*reg =lhs.template loadPacket(row, col);*/ \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row 
+ 3, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+hipThreadIdx_x*4; + + for (Index k = 0; k < k_size; k += 16) { + //lhs_pf0 = internal::pset1(0); + //rhs_pf0 = internal::pset1(0); + lhs_pf0 = make_float4(0, 0, 0, 0); + rhs_pf0 = make_float4(0, 0, 0, 0); + + Index lhs_horiz = hipThreadIdx_y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(hipThreadIdx_x%4)*4; + Index rhs_horiz0 = (hipThreadIdx_x>>2)+hipThreadIdx_y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((hipThreadIdx_x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((hipThreadIdx_x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2][hipThreadIdx_x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2+32][hipThreadIdx_x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... 
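
Earlier in this iteration, the pair of __shfl_xor(., 4) calls swaps two of the four rhs_pf0 components with the partner lane (the lane whose index differs in bit 2) before the values are staged into rhs_shmem2. A plain-C++ model of that exchange, assuming 8 lanes per group and illustrative values:

    #include <cassert>

    struct F4 { float x, y, z, w; };

    int main() {
      F4 v[8], orig[8];
      for (int t = 0; t < 8; ++t) {
        v[t] = F4{t + 0.0f, t + 0.25f, t + 0.5f, t + 0.75f};
        orig[t] = v[t];
      }
      float x1[8], x2[8];
      for (int t = 0; t < 8; ++t) {            // what each lane feeds into the shuffle
        x1[t] = (t % 8) < 4 ? v[t].y : v[t].x;
        x2[t] = (t % 8) < 4 ? v[t].w : v[t].z;
      }
      for (int t = 0; t < 8; ++t) {            // __shfl_xor(x, 4): take the partner's value
        const float r1 = x1[t ^ 4], r2 = x2[t ^ 4];
        if ((t % 8) < 4) { v[t].y = r1; v[t].w = r2; }
        else             { v[t].x = r1; v[t].z = r2; }
      }
      for (int t = 0; t < 4; ++t) {            // lanes t and t+4 have swapped half their data
        assert(v[t].y == orig[t + 4].x && v[t].w == orig[t + 4].z);
        assert(v[t + 4].x == orig[t].y && v[t + 4].z == orig[t].w);
      }
      return 0;
    }
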
+ + lhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[hipThreadIdx_y+16][hipThreadIdx_x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][hipThreadIdx_x]; + float2 fl2 = lhs_shmem2[koff + 16][hipThreadIdx_x]; + + int start_feature = hipThreadIdx_y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = hipThreadIdx_y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+hipThreadIdx_x*4+(hipThreadIdx_y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + /*lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0);*/ + + + lhs_pf0 = make_float4(0, 0, 0, 0); + lhs_pf1 = make_float4(0, 0, 0, 0); + lhs_pf2 = make_float4(0, 0, 0, 0); + lhs_pf3 = make_float4(0, 0, 0, 0); + + rhs_pf0 = make_float4(0, 0, 0, 0); + rhs_pf1 = make_float4(0, 0, 0, 0); + + if (!CHECK_LHS_BOUNDARY) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); + //lhs_pf3 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+24)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, 
(hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); + //lhs_pf3 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+24)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + lhs_pf0.x =lhs(lhs_vert + 
0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + } 
else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+hipThreadIdx_x*4; + Index rhs_horiz0 = hipThreadIdx_y*2+base_n; + Index rhs_horiz1 = hipThreadIdx_y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + //rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + //rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+hipThreadIdx_x*4 + 1 < k_size) { + rhs_pf0.x = 
rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+hipThreadIdx_x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[hipThreadIdx_y+32][hipThreadIdx_x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[hipThreadIdx_y+64][hipThreadIdx_x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[hipThreadIdx_y+96][hipThreadIdx_x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[hipThreadIdx_y/4][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[hipThreadIdx_y/4+8][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[hipThreadIdx_y/4+16][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[hipThreadIdx_y/4+24][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[hipThreadIdx_y/4 + 32][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[hipThreadIdx_y/4 + 40][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[hipThreadIdx_y/4 + 48][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[hipThreadIdx_y/4 + 56][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; + + // first feature is at (hipThreadIdx_y/4) * 8 last is at start + 8. 
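+ // Each thread accumulates a 4 (rows) x 8 (features) tile: results[i] holds
+ // the four row values (taken from a3 and a4) for output column
+ // start_feature + i.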
+ int start_feature = (hipThreadIdx_y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (hipThreadIdx_y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
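+ // Neither boundary can be assumed here: each of the 8 output columns is
+ // checked against n_size and each of the 4 rows against m_size before the
+ // corresponding element is written.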
+ for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +__launch_bounds__(256, 1) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +__launch_bounds__(256, 1) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the 
code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + // We need to redefine this method to make hipcc happy + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenContractionKernel), + dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel16x16), + 
dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel), + dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setHipSharedMemConfig(hipSharedMemBankSizeEightByte); + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_HIPCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h new file mode 100644 index 000000000..ba9971050 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h @@ -0,0 +1,1119 @@ +//#include "hip/hip_runtime.h" +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
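+ *
+ * Computes a "valid" (unpadded) convolution of the input tensor with the
+ * kernel tensor along the user-selected dimensions: each convolved dimension
+ * of the result has size input_dim - kernel_dim + 1.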
+ * + * + */ +namespace internal { + +template +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array hipInputDimensions; + array hipOutputDimensions; + array tmp = dimensions; + array ordering; + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + hipInputDimensions[index] = input_dims[indices[i]]; + hipOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast(Layout) == static_cast(ColMajor) + ? NumKernelDims + : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + hipInputDimensions[written] = input_dims[i]; + hipOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_hipInputStrides[i] = + m_hipInputStrides[i - 1] * hipInputDimensions[i - 1]; + m_hipOutputStrides[i] = + m_hipOutputStrides[i - 1] * hipOutputDimensions[i - 1]; + } else { + m_hipInputStrides[i] = 1; + m_hipOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_hipInputStrides[i] = + m_hipInputStrides[i + 1] * hipInputDimensions[i + 1]; + m_hipOutputStrides[i] = + m_hipOutputStrides[i + 1] * hipOutputDimensions[i + 1]; + } else { + m_hipInputStrides[i] = 1; + m_hipOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_hipInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_hipInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_hipInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_hipInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { 
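+ // ColMajor: decompose the packed plane index p along the non-kernel
+ // dimensions (outermost first) and translate it into a tensor offset.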
+ for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_hipOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_hipOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_hipOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_hipOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; + } + + private: + static const int NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_hipInputStrides; + array m_hipOutputStrides; +}; + + + +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + + + +template +class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = 
m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + 
return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. 
it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + +template +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = hipBlockIdx_y * hipBlockDim_y; + const int plane_stride = hipBlockDim_y * hipGridDim_y; + + for (int p = first_plane + hipThreadIdx_y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = hipThreadIdx_y * num_x_input; + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template +__global__ void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = hipBlockIdx_y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = hipBlockIdx_z * hipBlockDim_z; + const int plane_stride = hipBlockDim_z * hipGridDim_z; + + for (int p = first_plane + hipThreadIdx_z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = hipThreadIdx_z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + // Load inputs to shared memory + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = hipBlockIdx_y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = hipBlockIdx_z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = hipThreadIdx_z; k < num_z_input; k += hipBlockDim_z) { + for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + for (int k = hipThreadIdx_z; k < num_z_output; k += hipBlockDim_z) { + for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template +struct TensorEvaluator, GpuDevice> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& 
dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxHipThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxHipThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumHipMultiProcessors(); + const int hipWarpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini(inner_dim, numX); + const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((hipWarpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini(inner_dim, numP); + maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(hipWarpSize, maxX); + block_size.y = numext::mini(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0]); + const array kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, 4>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, 7>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini(inner_dim, numX); + const int maxY = numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini(1024/block_size.x, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[idxX], m_indices[idxY]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 4, 7>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 7, 4>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, Dynamic, 
Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const int idxZ = + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel3D, Index, InputDims>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, + numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
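+    // Informally: each output coefficient pays a one-off index computation
+    // (two additions, two multiplications and a division per dimension) plus,
+    // for each of the kernel_size taps, the cost of one input coefficient, one
+    // kernel coefficient and one add/mul pair; the expression below encodes
+    // exactly that.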
+ const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 341889e88..e94e577fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -35,9 +35,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef EIGEN_CUDA_ARCH +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running on the host CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; #else // Running on a CUDA device return 32; @@ -45,7 +48,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running on the host CPU return l1CacheSize(); #else @@ -55,7 +58,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running single threaded on the host CPU return l3CacheSize(); #else @@ -65,10 +68,14 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef EIGEN_CUDA_ARCH +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; #else // Running on a CUDA device return EIGEN_CUDA_ARCH / 100; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h new file mode 100644 index 000000000..c0e110987 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h @@ -0,0 +1,352 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
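+
+// This header provides the HIP counterpart of the CUDA device support: a
+// StreamInterface abstraction over hipStream_t, a HipStreamDevice built on the
+// HIP runtime, and the GpuDevice wrapper (allocation, asynchronous copies,
+// scratchpad/semaphore helpers and launch-related device properties) used by
+// the tensor evaluators. A minimal usage sketch, assuming a single default
+// stream:
+//
+//   Eigen::HipStreamDevice stream;     // default stream on the current device
+//   Eigen::GpuDevice device(&stream);  // handle handed to tensor evaluators
+//   void* buf = device.allocate(1024);
+//   device.memset(buf, 0, 1024);
+//   device.deallocate(buf);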
+ +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H + +#if defined(EIGEN_HIPCC) +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#endif +#include //for sleep function + +namespace Eigen { + +static const int kHipScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// HIP streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const hipStream_t& stream() const = 0; + virtual const hipDeviceProp_t& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +static hipDeviceProp_t* m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. +#if 0 && __cplusplus >= 201103L + static std::atomic first(true); + if (first.exchange(false)) { +#else + static bool first = true; + if (first) { + first = false; +#endif + // We're the first thread to reach this point. + int num_devices; + hipError_t status = hipGetDeviceCount(&num_devices); + if (status != hipSuccess) { + std::cerr << "Failed to get the number of HIP devices: " + << hipGetErrorString(status) + << std::endl; + assert(status == hipSuccess); + } + m_deviceProperties = new hipDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = hipGetDeviceProperties(&m_deviceProperties[i], i); + if (status != hipSuccess) { + std::cerr << "Failed to initialize HIP device #" + << i + << ": " + << hipGetErrorString(status) + << std::endl; + assert(status == hipSuccess); + } + } + +#if 0 && __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_release); +#endif + m_devicePropInitialized = true; + } else { + // Wait for the other thread to inititialize the properties. + while (!m_devicePropInitialized) { +#if 0 && __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_acquire); +#endif + sleep(1); + } + } + } +} + +static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00; + +class HipStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + hipGetDevice(&device_); + initializeDeviceProp(); + } + // Use the default stream on the specified device + HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { + initializeDeviceProp(); + } + // Use the specified stream. 
Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + HipStreamDevice(const hipStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + hipGetDevice(&device_); + } else { + int num_devices; + hipError_t err = hipGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + assert(device < num_devices); + device_ = device; + } + initializeDeviceProp(); + } + + virtual ~HipStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const hipStream_t& stream() const { return *stream_; } + const hipDeviceProp_t& deviceProperties() const { + return m_deviceProperties[device_]; + } + virtual void* allocate(size_t num_bytes) const { + hipError_t err = hipSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + void* result; + err = hipMalloc(&result, num_bytes); + assert(err == hipSuccess); + assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + hipError_t err = hipSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + assert(buffer != NULL); + err = hipFree(buffer); + assert(err == hipSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kHipScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kHipScratchSize; + semaphore_ = reinterpret_cast(scratch); + //hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int)); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + } + return semaphore_; + } + + private: + const hipStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. 
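+  // The accessors below are thin forwarding wrappers around the
+  // StreamInterface: allocation and deallocation go through hipMalloc/hipFree
+  // on the stream's device, while the copies further down are issued with
+  // hipMemcpyAsync on that stream, so they complete asynchronously unless
+  // synchronize() is called.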
+ EIGEN_STRONG_INLINE const hipStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#if !defined(EIGEN_HIP_DEVICE_COMPILE) + hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + hipError_t err = + hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + hipError_t err = + hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#if !defined(EIGEN_HIP_DEVICE_COMPILE) + //TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream()); + hipError_t err = hipMemset(buffer, c, n); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip devices. 
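+    // Both cache sizes reported here are coarse host-side heuristics for the
+    // tensor cost model and blocking logic rather than values queried from the
+    // HIP runtime; reusing the L1 figure keeps the last-level estimate
+    // conservative.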
+ return firstLevelCacheSize(); + } + +// FIXME - this will move into HIP +#if defined(EIGEN_HIP_DEVICE_COMPILE) +#undef assert +#define assert(COND) +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#if defined(EIGEN_HIPCC) && \ + !defined(EIGEN_HIP_DEVICE_COMPILE) + hipError_t err = hipStreamSynchronize(stream_->stream()); + if (err != hipSuccess) { + std::cerr << "Error detected in HIP stream: " + << hipGetErrorString(err) + << std::endl; + assert(err == hipSuccess); + } +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { + return stream_->deviceProperties().minor; + } + + EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + + // This function checks if the HIP runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#if defined(EIGEN_HIPCC) + hipError_t error = hipStreamQuery(stream_->stream()); + return (error == hipSuccess) || (error == hipErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \ + assert(hipGetLastError() == hipSuccess); + + +// FIXME: Should be device and kernel specific. +#if defined(EIGEN_HIPCC) +static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) { +#if !defined(EIGEN_HIP_DEVICE_COMPILE) + hipError_t status = hipDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + assert(status == hipSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 0ffe68ab3..24a57970a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -201,7 +201,7 @@ class TensorExecutor { }; -#if defined(EIGEN_CUDACC) +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE @@ -250,6 +250,17 @@ inline void TensorExecutor::run( TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { +#if defined(EIGEN_HIPCC) + const int block_size = device.maxHipThreadsPerBlock(); + const int max_blocks = device.getNumHipMultiProcessors() * + device.maxHipThreadsPerMultiProcessor() / block_size; + const Index size = array_prod(evaluator.dimensions()); + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. 
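+    // For example, assuming maxHipThreadsPerBlock() == 256, 8 multiprocessors
+    // with 2048 threads each and a tensor of 10000 coefficients:
+    //   block_size = 256, max_blocks = 8 * 2048 / 256 = 64 and
+    //   divup(10000, 256) = 40, so num_blocks = max(min(64, 40), 1) = 40.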
+ const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel, Index>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size); +#else const int block_size = device.maxCudaThreadsPerBlock(); const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; @@ -260,11 +271,12 @@ inline void TensorExecutor::run( LAUNCH_CUDA_KERNEL( (EigenMetaKernel, Index>), num_blocks, block_size, 0, device, evaluator, size); +#endif } evaluator.cleanup(); } -#endif // EIGEN_CUDACC +#endif // EIGEN_CUDACC || EIGEN_HIPCC #endif // EIGEN_USE_GPU // SYCL Executor policy diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c015ce196..b8f0bc798 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -109,7 +109,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 3209fecd3..835efbf72 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -350,7 +350,11 @@ struct IndexPairList : internal::IndexTuple { namespace internal { -template size_t array_prod(const IndexList& sizes) { +template + #if defined(EIGEN_HIPCC) + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + #endif + size_t array_prod(const IndexList& sizes) { size_t result = 1; for (int i = 0; i < array_size >::value; ++i) { result *= sizes[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index c9e61f359..8e1ba486d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 5431eb740..de1075cc1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16) +#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16))) template <> struct PacketType { typedef half2 type; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index e59074506..2a979845b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -858,7 +858,10 @@ struct TensorEvaluator __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) template __global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template @@ -495,7 +495,11 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { + EIGEN_STRONG_INLINE + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. @@ -694,9 +698,9 @@ struct TensorEvaluator, #ifdef EIGEN_USE_THREADS template friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); @@ -774,14 +778,22 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; +#if defined(EIGEN_HIPCC) + public: +#endif + // Evaluator for the input expression. TensorEvaluator m_impl; +#if defined(EIGEN_HIPCC) + private: +#endif + // Operation to apply for computing the reduction. Op m_reducer; // For full reductions -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) static const bool RunningOnGPU = internal::is_same::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h new file mode 100644 index 000000000..5304a22c5 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h @@ -0,0 +1,815 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" +#endif + +#if defined(EIGEN_HIPCC) +#define HIP_WARP_SIZE 64 +#endif + +namespace Eigen { +namespace internal { + + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple hip thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. 
If in the meantime another hip thread
+// updated the content of the output address, it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+  if (sizeof(T) == 4)
+  {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else {
+    assert(0 && "Wordsize not supported");
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template